Diffstat (limited to 'rand/src/distributions/utils.rs')
-rw-r--r-- | rand/src/distributions/utils.rs | 504
1 file changed, 504 insertions, 0 deletions
diff --git a/rand/src/distributions/utils.rs b/rand/src/distributions/utils.rs
new file mode 100644
index 0000000..a2112fd
--- /dev/null
+++ b/rand/src/distributions/utils.rs
@@ -0,0 +1,504 @@
+// Copyright 2018 Developers of the Rand project.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! Math helper functions
+
+#[cfg(feature="simd_support")]
+use packed_simd::*;
+#[cfg(feature="std")]
+use distributions::ziggurat_tables;
+#[cfg(feature="std")]
+use Rng;
+
+
+pub trait WideningMultiply<RHS = Self> {
+    type Output;
+
+    fn wmul(self, x: RHS) -> Self::Output;
+}
+
+macro_rules! wmul_impl {
+    ($ty:ty, $wide:ty, $shift:expr) => {
+        impl WideningMultiply for $ty {
+            type Output = ($ty, $ty);
+
+            #[inline(always)]
+            fn wmul(self, x: $ty) -> Self::Output {
+                let tmp = (self as $wide) * (x as $wide);
+                ((tmp >> $shift) as $ty, tmp as $ty)
+            }
+        }
+    };
+
+    // simd bulk implementation
+    ($(($ty:ident, $wide:ident),)+, $shift:expr) => {
+        $(
+            impl WideningMultiply for $ty {
+                type Output = ($ty, $ty);
+
+                #[inline(always)]
+                fn wmul(self, x: $ty) -> Self::Output {
+                    // For supported vectors, this should compile to a couple
+                    // supported multiply & swizzle instructions (no actual
+                    // casting).
+                    // TODO: optimize
+                    let y: $wide = self.cast();
+                    let x: $wide = x.cast();
+                    let tmp = y * x;
+                    let hi: $ty = (tmp >> $shift).cast();
+                    let lo: $ty = tmp.cast();
+                    (hi, lo)
+                }
+            }
+        )+
+    };
+}
+wmul_impl! { u8, u16, 8 }
+wmul_impl! { u16, u32, 16 }
+wmul_impl! { u32, u64, 32 }
+#[cfg(rust_1_26)]
+wmul_impl! { u64, u128, 64 }
+
+// This code is a translation of the __mulddi3 function in LLVM's
+// compiler-rt. It is an optimised variant of the common method
+// `(a + b) * (c + d) = ac + ad + bc + bd`.
+//
+// For some reason LLVM can optimise the C version very well, but
+// keeps shuffling registers in this Rust translation.
+macro_rules! wmul_impl_large {
+    ($ty:ty, $half:expr) => {
+        impl WideningMultiply for $ty {
+            type Output = ($ty, $ty);
+
+            #[inline(always)]
+            fn wmul(self, b: $ty) -> Self::Output {
+                const LOWER_MASK: $ty = !0 >> $half;
+                let mut low = (self & LOWER_MASK).wrapping_mul(b & LOWER_MASK);
+                let mut t = low >> $half;
+                low &= LOWER_MASK;
+                t += (self >> $half).wrapping_mul(b & LOWER_MASK);
+                low += (t & LOWER_MASK) << $half;
+                let mut high = t >> $half;
+                t = low >> $half;
+                low &= LOWER_MASK;
+                t += (b >> $half).wrapping_mul(self & LOWER_MASK);
+                low += (t & LOWER_MASK) << $half;
+                high += t >> $half;
+                high += (self >> $half).wrapping_mul(b >> $half);
+
+                (high, low)
+            }
+        }
+    };
+
+    // simd bulk implementation
+    (($($ty:ty,)+) $scalar:ty, $half:expr) => {
+        $(
+            impl WideningMultiply for $ty {
+                type Output = ($ty, $ty);
+
+                #[inline(always)]
+                fn wmul(self, b: $ty) -> Self::Output {
+                    // needs wrapping multiplication
+                    const LOWER_MASK: $scalar = !0 >> $half;
+                    let mut low = (self & LOWER_MASK) * (b & LOWER_MASK);
+                    let mut t = low >> $half;
+                    low &= LOWER_MASK;
+                    t += (self >> $half) * (b & LOWER_MASK);
+                    low += (t & LOWER_MASK) << $half;
+                    let mut high = t >> $half;
+                    t = low >> $half;
+                    low &= LOWER_MASK;
+                    t += (b >> $half) * (self & LOWER_MASK);
+                    low += (t & LOWER_MASK) << $half;
+                    high += t >> $half;
+                    high += (self >> $half) * (b >> $half);
+
+                    (high, low)
+                }
+            }
+        )+
+    };
+}
+#[cfg(not(rust_1_26))]
+wmul_impl_large! { u64, 32 }
+#[cfg(rust_1_26)]
+wmul_impl_large! { u128, 64 }
+
+macro_rules! wmul_impl_usize {
+    ($ty:ty) => {
+        impl WideningMultiply for usize {
+            type Output = (usize, usize);
+
+            #[inline(always)]
+            fn wmul(self, x: usize) -> Self::Output {
+                let (high, low) = (self as $ty).wmul(x as $ty);
+                (high as usize, low as usize)
+            }
+        }
+    }
+}
+#[cfg(target_pointer_width = "32")]
+wmul_impl_usize! { u32 }
+#[cfg(target_pointer_width = "64")]
+wmul_impl_usize! { u64 }
+
+#[cfg(all(feature = "simd_support", feature = "nightly"))]
+mod simd_wmul {
+    #[cfg(target_arch = "x86")]
+    use core::arch::x86::*;
+    #[cfg(target_arch = "x86_64")]
+    use core::arch::x86_64::*;
+    use super::*;
+
+    wmul_impl! {
+        (u8x2, u16x2),
+        (u8x4, u16x4),
+        (u8x8, u16x8),
+        (u8x16, u16x16),
+        (u8x32, u16x32),,
+        8
+    }
+
+    wmul_impl! { (u16x2, u32x2),, 16 }
+    #[cfg(not(target_feature = "sse2"))]
+    wmul_impl! { (u16x4, u32x4),, 16 }
+    #[cfg(not(target_feature = "sse4.2"))]
+    wmul_impl! { (u16x8, u32x8),, 16 }
+    #[cfg(not(target_feature = "avx2"))]
+    wmul_impl! { (u16x16, u32x16),, 16 }
+
+    // 16-bit lane widths allow use of the x86 `mulhi` instructions, which
+    // means `wmul` can be implemented with only two instructions.
+    #[allow(unused_macros)]
+    macro_rules! wmul_impl_16 {
+        ($ty:ident, $intrinsic:ident, $mulhi:ident, $mullo:ident) => {
+            impl WideningMultiply for $ty {
+                type Output = ($ty, $ty);
+
+                #[inline(always)]
+                fn wmul(self, x: $ty) -> Self::Output {
+                    let b = $intrinsic::from_bits(x);
+                    let a = $intrinsic::from_bits(self);
+                    let hi = $ty::from_bits(unsafe { $mulhi(a, b) });
+                    let lo = $ty::from_bits(unsafe { $mullo(a, b) });
+                    (hi, lo)
+                }
+            }
+        };
+    }
+
+    #[cfg(target_feature = "sse2")]
+    wmul_impl_16! { u16x4, __m64, _mm_mulhi_pu16, _mm_mullo_pi16 }
+    #[cfg(target_feature = "sse4.2")]
+    wmul_impl_16! { u16x8, __m128i, _mm_mulhi_epu16, _mm_mullo_epi16 }
+    #[cfg(target_feature = "avx2")]
+    wmul_impl_16! { u16x16, __m256i, _mm256_mulhi_epu16, _mm256_mullo_epi16 }
+    // FIXME: there are no `__m512i` types in stdsimd yet, so `wmul::<u16x32>`
+    // cannot use the same implementation.
+
+    wmul_impl! {
+        (u32x2, u64x2),
+        (u32x4, u64x4),
+        (u32x8, u64x8),,
+        32
+    }
+
+    // TODO: optimize, this seems to seriously slow things down
+    wmul_impl_large! { (u8x64,) u8, 4 }
+    wmul_impl_large! { (u16x32,) u16, 8 }
+    wmul_impl_large! { (u32x16,) u32, 16 }
+    wmul_impl_large! { (u64x2, u64x4, u64x8,) u64, 32 }
+}
+#[cfg(all(feature = "simd_support", feature = "nightly"))]
+pub use self::simd_wmul::*;
+
+
+/// Helper trait when dealing with scalar and SIMD floating point types.
+pub(crate) trait FloatSIMDUtils {
+    // `PartialOrd` for vectors compares lexicographically. We want to compare all
+    // the individual SIMD lanes instead, and get the combined result over all
+    // lanes. This is possible using something like `a.lt(b).all()`, but we
+    // implement it as a trait so we can write the same code for `f32` and `f64`.
+    // Only the comparison functions we need are implemented.
+    fn all_lt(self, other: Self) -> bool;
+    fn all_le(self, other: Self) -> bool;
+    fn all_finite(self) -> bool;
+
+    type Mask;
+    fn finite_mask(self) -> Self::Mask;
+    fn gt_mask(self, other: Self) -> Self::Mask;
+    fn ge_mask(self, other: Self) -> Self::Mask;
+
+    // Decrease all lanes where the mask is `true` to the next lower value
+    // representable by the floating-point type. At least one of the lanes
+    // must be set.
+    fn decrease_masked(self, mask: Self::Mask) -> Self;
+
+    // Convert from int value. Conversion is done while retaining the numerical
+    // value, not by retaining the binary representation.
+    type UInt;
+    fn cast_from_int(i: Self::UInt) -> Self;
+}
+
+/// Implement functions available in std builds but missing from core primitives
+#[cfg(not(std))]
+pub(crate) trait Float : Sized {
+    type Bits;
+
+    fn is_nan(self) -> bool;
+    fn is_infinite(self) -> bool;
+    fn is_finite(self) -> bool;
+    fn to_bits(self) -> Self::Bits;
+    fn from_bits(v: Self::Bits) -> Self;
+}
+
+/// Implement functions on f32/f64 to give them APIs similar to SIMD types
+pub(crate) trait FloatAsSIMD : Sized {
+    #[inline(always)]
+    fn lanes() -> usize { 1 }
+    #[inline(always)]
+    fn splat(scalar: Self) -> Self { scalar }
+    #[inline(always)]
+    fn extract(self, index: usize) -> Self { debug_assert_eq!(index, 0); self }
+    #[inline(always)]
+    fn replace(self, index: usize, new_value: Self) -> Self { debug_assert_eq!(index, 0); new_value }
+}
+
+pub(crate) trait BoolAsSIMD : Sized {
+    fn any(self) -> bool;
+    fn all(self) -> bool;
+    fn none(self) -> bool;
+}
+
+impl BoolAsSIMD for bool {
+    #[inline(always)]
+    fn any(self) -> bool { self }
+    #[inline(always)]
+    fn all(self) -> bool { self }
+    #[inline(always)]
+    fn none(self) -> bool { !self }
+}
+
+macro_rules! scalar_float_impl {
+    ($ty:ident, $uty:ident) => {
+        #[cfg(not(std))]
+        impl Float for $ty {
+            type Bits = $uty;
+
+            #[inline]
+            fn is_nan(self) -> bool {
+                self != self
+            }
+
+            #[inline]
+            fn is_infinite(self) -> bool {
+                self == ::core::$ty::INFINITY || self == ::core::$ty::NEG_INFINITY
+            }
+
+            #[inline]
+            fn is_finite(self) -> bool {
+                !(self.is_nan() || self.is_infinite())
+            }
+
+            #[inline]
+            fn to_bits(self) -> Self::Bits {
+                unsafe { ::core::mem::transmute(self) }
+            }
+
+            #[inline]
+            fn from_bits(v: Self::Bits) -> Self {
+                // It turns out the safety issues with sNaN were overblown! Hooray!
+                unsafe { ::core::mem::transmute(v) }
+            }
+        }
+
+        impl FloatSIMDUtils for $ty {
+            type Mask = bool;
+            #[inline(always)]
+            fn all_lt(self, other: Self) -> bool { self < other }
+            #[inline(always)]
+            fn all_le(self, other: Self) -> bool { self <= other }
+            #[inline(always)]
+            fn all_finite(self) -> bool { self.is_finite() }
+            #[inline(always)]
+            fn finite_mask(self) -> Self::Mask { self.is_finite() }
+            #[inline(always)]
+            fn gt_mask(self, other: Self) -> Self::Mask { self > other }
+            #[inline(always)]
+            fn ge_mask(self, other: Self) -> Self::Mask { self >= other }
+            #[inline(always)]
+            fn decrease_masked(self, mask: Self::Mask) -> Self {
+                debug_assert!(mask, "At least one lane must be set");
+                <$ty>::from_bits(self.to_bits() - 1)
+            }
+            type UInt = $uty;
+            fn cast_from_int(i: Self::UInt) -> Self { i as $ty }
+        }
+
+        impl FloatAsSIMD for $ty {}
+    }
+}
+
+scalar_float_impl!(f32, u32);
+scalar_float_impl!(f64, u64);
+
+
+#[cfg(feature="simd_support")]
+macro_rules! simd_impl {
+    ($ty:ident, $f_scalar:ident, $mty:ident, $uty:ident) => {
+        impl FloatSIMDUtils for $ty {
+            type Mask = $mty;
+            #[inline(always)]
+            fn all_lt(self, other: Self) -> bool { self.lt(other).all() }
+            #[inline(always)]
+            fn all_le(self, other: Self) -> bool { self.le(other).all() }
+            #[inline(always)]
+            fn all_finite(self) -> bool { self.finite_mask().all() }
+            #[inline(always)]
+            fn finite_mask(self) -> Self::Mask {
+                // This can possibly be done faster by checking bit patterns
+                let neg_inf = $ty::splat(::core::$f_scalar::NEG_INFINITY);
+                let pos_inf = $ty::splat(::core::$f_scalar::INFINITY);
+                self.gt(neg_inf) & self.lt(pos_inf)
+            }
+            #[inline(always)]
+            fn gt_mask(self, other: Self) -> Self::Mask { self.gt(other) }
+            #[inline(always)]
+            fn ge_mask(self, other: Self) -> Self::Mask { self.ge(other) }
+            #[inline(always)]
+            fn decrease_masked(self, mask: Self::Mask) -> Self {
+                // Casting a mask into ints will produce all bits set for
+                // true, and 0 for false. Adding that to the binary
+                // representation of a float means subtracting one from
+                // the binary representation, resulting in the next lower
+                // value representable by $ty. This works even when the
+                // current value is infinity.
+                debug_assert!(mask.any(), "At least one lane must be set");
+                <$ty>::from_bits(<$uty>::from_bits(self) + <$uty>::from_bits(mask))
+            }
+            type UInt = $uty;
+            fn cast_from_int(i: Self::UInt) -> Self { i.cast() }
+        }
+    }
+}
+
+#[cfg(feature="simd_support")] simd_impl! { f32x2, f32, m32x2, u32x2 }
+#[cfg(feature="simd_support")] simd_impl! { f32x4, f32, m32x4, u32x4 }
+#[cfg(feature="simd_support")] simd_impl! { f32x8, f32, m32x8, u32x8 }
+#[cfg(feature="simd_support")] simd_impl! { f32x16, f32, m32x16, u32x16 }
+#[cfg(feature="simd_support")] simd_impl! { f64x2, f64, m64x2, u64x2 }
+#[cfg(feature="simd_support")] simd_impl! { f64x4, f64, m64x4, u64x4 }
+#[cfg(feature="simd_support")] simd_impl! { f64x8, f64, m64x8, u64x8 }
+
+/// Calculates ln(gamma(x)) (natural logarithm of the gamma
+/// function) using the Lanczos approximation.
+///
+/// The approximation expresses the gamma function as:
+/// `gamma(z+1) = sqrt(2*pi)*(z+g+0.5)^(z+0.5)*exp(-z-g-0.5)*Ag(z)`
+/// `g` is an arbitrary constant; we use the approximation with `g=5`.
+///
+/// Noting that `gamma(z+1) = z*gamma(z)` and applying `ln` to both sides:
+/// `ln(gamma(z)) = (z+0.5)*ln(z+g+0.5)-(z+g+0.5) + ln(sqrt(2*pi)*Ag(z)/z)`
+///
+/// `Ag(z)` is an infinite series with coefficients that can be calculated
+/// ahead of time - we use just the first 6 terms, which is good enough
+/// for most purposes.
+#[cfg(feature="std")]
+pub fn log_gamma(x: f64) -> f64 {
+    // precalculated 6 coefficients for the first 6 terms of the series
+    let coefficients: [f64; 6] = [
+        76.18009172947146,
+        -86.50532032941677,
+        24.01409824083091,
+        -1.231739572450155,
+        0.1208650973866179e-2,
+        -0.5395239384953e-5,
+    ];
+
+    // (x+0.5)*ln(x+g+0.5)-(x+g+0.5)
+    let tmp = x + 5.5;
+    let log = (x + 0.5) * tmp.ln() - tmp;
+
+    // the first few terms of the series for Ag(x)
+    let mut a = 1.000000000190015;
+    let mut denom = x;
+    for coeff in &coefficients {
+        denom += 1.0;
+        a += coeff / denom;
+    }
+
+    // get everything together
+    // a is Ag(x)
+    // 2.5066... is sqrt(2pi)
+    log + (2.5066282746310005 * a / x).ln()
+}
+
+/// Sample a random number using the Ziggurat method (specifically the
+/// ZIGNOR variant from Doornik 2005). Most of the arguments are
+/// directly from the paper:
+///
+/// * `rng`: source of randomness
+/// * `symmetric`: whether this is a symmetric distribution, or one-sided with P(x < 0) = 0.
+/// * `x_tab`: the $x_i$ abscissae
+/// * `f_tab`: precomputed values of the PDF at the $x_i$ (i.e. $f(x_i)$)
+/// * `pdf`: the probability density function
+/// * `zero_case`: manual sampling from the tail when we choose the
+///   bottom box (i.e. i == 0)
+
+// the perf improvement (25-50%) is definitely worth the extra code
+// size from force-inlining.
+#[cfg(feature="std")]
+#[inline(always)]
+pub fn ziggurat<R: Rng + ?Sized, P, Z>(
+    rng: &mut R,
+    symmetric: bool,
+    x_tab: ziggurat_tables::ZigTable,
+    f_tab: ziggurat_tables::ZigTable,
+    mut pdf: P,
+    mut zero_case: Z,
+) -> f64
+where
+    P: FnMut(f64) -> f64,
+    Z: FnMut(&mut R, f64) -> f64,
+{
+    use distributions::float::IntoFloat;
+    loop {
+        // As an optimisation we re-implement the conversion to an f64.
+        // Of the 12 least significant bits left over after taking the 52
+        // mantissa bits, we use 8 to construct `i`. This saves us generating
+        // a whole extra random number, while the added precision of using
+        // 64 bits for f64 does not buy us much.
+        let bits = rng.next_u64();
+        let i = bits as usize & 0xff;
+
+        let u = if symmetric {
+            // Convert to a value in the range [2,4) and subtract to get [-1,1).
+            // We can't convert to an open range directly; that would require
+            // subtracting `3.0 - EPSILON`, which is not representable.
+            // It is possible with an extra step, but an open range does not
+            // seem necessary for the ziggurat algorithm anyway.
+            (bits >> 12).into_float_with_exponent(1) - 3.0
+        } else {
+            // Convert to a value in the range [1,2) and subtract to get (0,1)
+            (bits >> 12).into_float_with_exponent(0)
+                - (1.0 - ::core::f64::EPSILON / 2.0)
+        };
+        let x = u * x_tab[i];
+
+        let test_x = if symmetric { x.abs() } else { x };
+
+        // algebraically equivalent to |u| < x_tab[i+1]/x_tab[i] (or u < x_tab[i+1]/x_tab[i])
+        if test_x < x_tab[i + 1] {
+            return x;
+        }
+        if i == 0 {
+            return zero_case(rng, u);
+        }
+        // algebraically equivalent to f1 + DRanU()*(f0 - f1) < 1
+        if f_tab[i + 1] + (f_tab[i] - f_tab[i + 1]) * rng.gen::<f64>() < pdf(x) {
+            return x;
+        }
+    }
+}
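The `WideningMultiply` contract is easiest to see with concrete numbers: `wmul` returns the high and low halves of the full double-width product. A minimal standalone sketch of what the `u32` impl computes internally (the values are arbitrary, chosen only for illustration):

    fn main() {
        let (a, b) = (0xDEAD_BEEF_u32, 0x1234_5678_u32);

        // The `u32` impl widens to u64, multiplies, and splits the result.
        let wide = (a as u64) * (b as u64);
        let (hi, lo) = ((wide >> 32) as u32, wide as u32);

        // The (high, low) pair reassembles into the exact 64-bit product.
        assert_eq!(((hi as u64) << 32) | lo as u64, wide);
        println!("high = {:#010x}, low = {:#010x}", hi, lo);
    }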
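`wmul_impl_large` covers the types that have no wider primitive to widen into: `u64` on pre-1.26 compilers, `u128`, and the widest SIMD vectors. The half-word decomposition it borrows from `__mulddi3` can be cross-checked against native 128-bit arithmetic. Below is the scalar arm written out for `u64` with 32-bit halves; this is a verification sketch, not part of the crate:

    // Same algorithm as the scalar arm of `wmul_impl_large`, with $half = 32:
    // split each operand into 32-bit halves and combine the four partial
    // products while carrying between the halves.
    fn wmul_u64(a: u64, b: u64) -> (u64, u64) {
        const HALF: u32 = 32;
        const LOWER_MASK: u64 = !0 >> HALF;

        let mut low = (a & LOWER_MASK).wrapping_mul(b & LOWER_MASK);
        let mut t = low >> HALF;
        low &= LOWER_MASK;
        t += (a >> HALF).wrapping_mul(b & LOWER_MASK);
        low += (t & LOWER_MASK) << HALF;
        let mut high = t >> HALF;
        t = low >> HALF;
        low &= LOWER_MASK;
        t += (b >> HALF).wrapping_mul(a & LOWER_MASK);
        low += (t & LOWER_MASK) << HALF;
        high += t >> HALF;
        high += (a >> HALF).wrapping_mul(b >> HALF);
        (high, low)
    }

    fn main() {
        let (a, b) = (0xFFFF_FFFF_FFFF_FFFF_u64, 0x1234_5678_9ABC_DEF0_u64);
        let (high, low) = wmul_u64(a, b);
        // Cross-check against the native 128-bit product.
        let wide = (a as u128) * (b as u128);
        assert_eq!(high, (wide >> 64) as u64);
        assert_eq!(low, wide as u64);
    }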
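`FloatAsSIMD` and `BoolAsSIMD` exist so the distribution code can be written once against a lane-style API and run unchanged on scalars and vectors. Extracted into a standalone sketch (the trait body is copied from the file; `all_in_range` is a hypothetical caller added for illustration):

    // `bool` gains the `any`/`all`/`none` API of a SIMD mask, so scalar
    // and vector code can share one code path.
    trait BoolAsSIMD: Sized {
        fn any(self) -> bool;
        fn all(self) -> bool;
        fn none(self) -> bool;
    }

    impl BoolAsSIMD for bool {
        fn any(self) -> bool { self }
        fn all(self) -> bool { self }
        fn none(self) -> bool { !self }
    }

    // Hypothetical generic caller: works for `bool` today, and for any
    // SIMD mask type that implements the same trait.
    fn all_in_range<M: BoolAsSIMD>(mask: M) -> bool {
        mask.all()
    }

    fn main() {
        assert!(all_in_range(true));
        assert!(!all_in_range(false));
        assert!(false.none());
    }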
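The bit trick documented in `decrease_masked` can be exercised on the scalar path with plain `std` float methods: subtracting 1 from the IEEE-754 bit pattern steps to the next lower representable value, and the SIMD arm gets its `-1` by reinterpreting an all-true mask as an integer. A self-contained demo:

    fn main() {
        // Next value below 1.0: subtract 1 from the bit pattern.
        let x = 1.0_f64;
        let next_down = f64::from_bits(x.to_bits() - 1);
        assert!(next_down < x);
        // For 1.0 the step is 2^-53 (half an epsilon, because the
        // exponent also decreases by one).
        assert_eq!(x - next_down, 2f64.powi(-53));

        // As the comment in `simd_impl` notes, this also works at
        // infinity: it steps down to f64::MAX.
        assert_eq!(f64::from_bits(f64::INFINITY.to_bits() - 1), f64::MAX);
    }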
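Because `gamma(n) = (n-1)!` for positive integers, `log_gamma` is easy to sanity-check. A sketch, assuming `log_gamma` from this file is in scope; the tolerance is a guess appropriate for a 6-term Lanczos series, not a documented bound:

    fn main() {
        // gamma(5) = 4! = 24, so log_gamma(5.0) should be ln(24).
        let lg = log_gamma(5.0);
        let expected = 24f64.ln();
        assert!((lg - expected).abs() < 1e-8, "got {}, expected {}", lg, expected);
    }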
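The `ziggurat` loop leans on `into_float_with_exponent` from `distributions::float`: drop 52 random bits into the mantissa under a fixed biased exponent, then shift the resulting range with a subtraction. A standalone reconstruction of the idea (the masking here is defensive; the helper's exact implementation lives in `float.rs`):

    // Biased exponent 1023 + e with `bits` supplying the 52 mantissa bits:
    // unbiased exponent 0 yields [1, 2), exponent 1 yields [2, 4).
    fn into_float_with_exponent(bits: u64, exponent: i32) -> f64 {
        let exponent_bits = ((1023 + exponent) as u64) << 52;
        f64::from_bits((bits & 0x000F_FFFF_FFFF_FFFF) | exponent_bits)
    }

    fn main() {
        let bits = 0x0005_5555_5555_5555_u64; // stand-in for rng.next_u64() >> 12

        // One-sided case: [1, 2) shifted to (0, 1).
        let u = into_float_with_exponent(bits, 0) - (1.0 - std::f64::EPSILON / 2.0);
        assert!(u > 0.0 && u < 1.0);

        // Symmetric case: [2, 4) shifted to [-1, 1).
        let v = into_float_with_exponent(bits, 1) - 3.0;
        assert!(v >= -1.0 && v < 1.0);
    }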