cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

MathFunctions.h (8102B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 #ifndef EIGEN_MATH_FUNCTIONS_AVX_H
     11 #define EIGEN_MATH_FUNCTIONS_AVX_H
     12 
     13 /* The sin and cos functions of this file are loosely derived from
     14  * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
     15  */
     16 
     17 namespace Eigen {
     18 
     19 namespace internal {
     20 
     21 template <>
     22 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
     23 psin<Packet8f>(const Packet8f& _x) {
     24   return psin_float(_x);
     25 }
     26 
     27 template <>
     28 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
     29 pcos<Packet8f>(const Packet8f& _x) {
     30   return pcos_float(_x);
     31 }
     32 
     33 template <>
     34 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
     35 plog<Packet8f>(const Packet8f& _x) {
     36   return plog_float(_x);
     37 }
     38 
     39 template <>
     40 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
     41 plog<Packet4d>(const Packet4d& _x) {
     42   return plog_double(_x);
     43 }
     44 
     45 template <>
     46 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
     47 plog2<Packet8f>(const Packet8f& _x) {
     48   return plog2_float(_x);
     49 }
     50 
     51 template <>
     52 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
     53 plog2<Packet4d>(const Packet4d& _x) {
     54   return plog2_double(_x);
     55 }
     56 
     57 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
     58 Packet8f plog1p<Packet8f>(const Packet8f& _x) {
     59   return generic_plog1p(_x);
     60 }
     61 
     62 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
     63 Packet8f pexpm1<Packet8f>(const Packet8f& _x) {
     64   return generic_expm1(_x);
     65 }
     66 
     67 // Exponential function. Works by writing "x = m*log(2) + r" where
     68 // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
     69 // "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
     70 template <>
     71 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
     72 pexp<Packet8f>(const Packet8f& _x) {
     73   return pexp_float(_x);
     74 }
     75 
     76 // Hyperbolic Tangent function.
     77 template <>
     78 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
     79 ptanh<Packet8f>(const Packet8f& _x) {
     80   return internal::generic_fast_tanh_float(_x);
     81 }
     82 
     83 // Exponential function for doubles.
     84 template <>
     85 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
     86 pexp<Packet4d>(const Packet4d& _x) {
     87   return pexp_double(_x);
     88 }
     89 
     90 // Functions for sqrt.
     91 // The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
     92 // of Newton's method, at a cost of 1-2 bits of precision as opposed to the
     93 // exact solution. It does not handle +inf, or denormalized numbers correctly.
     94 // The main advantage of this approach is not just speed, but also the fact that
     95 // it can be inlined and pipelined with other computations, further reducing its
     96 // effective latency. This is similar to Quake3's fast inverse square root.
     97 // For detail see here: http://www.beyond3d.com/content/articles/8/
     98 #if EIGEN_FAST_MATH
     99 template <>
    100 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
    101 Packet8f psqrt<Packet8f>(const Packet8f& _x) {
    102   Packet8f minus_half_x = pmul(_x, pset1<Packet8f>(-0.5f));
    103   Packet8f denormal_mask = pandnot(
    104       pcmp_lt(_x, pset1<Packet8f>((std::numeric_limits<float>::min)())),
    105       pcmp_lt(_x, pzero(_x)));
    106 
    107   // Compute approximate reciprocal sqrt.
    108   Packet8f x = _mm256_rsqrt_ps(_x);
    109   // Do a single step of Newton's iteration.
    110   x = pmul(x, pmadd(minus_half_x, pmul(x,x), pset1<Packet8f>(1.5f)));
    111   // Flush results for denormals to zero.
    112   return pandnot(pmul(_x,x), denormal_mask);
    113 }
    114 
    115 #else
    116 
    117 template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
    118 Packet8f psqrt<Packet8f>(const Packet8f& _x) {
    119   return _mm256_sqrt_ps(_x);
    120 }
    121 
    122 #endif
    123 
    124 template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
    125 Packet4d psqrt<Packet4d>(const Packet4d& _x) {
    126   return _mm256_sqrt_pd(_x);
    127 }
    128 
    129 #if EIGEN_FAST_MATH
    130 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
    131 Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
    132   _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000);
    133   _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f);
    134   _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f);
    135   _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000);
    136 
    137   Packet8f neg_half = pmul(_x, p8f_minus_half);
    138 
    139   // select only the inverse sqrt of positive normal inputs (denormals are
    140   // flushed to zero and cause infs as well).
    141   Packet8f lt_min_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ);
    142   Packet8f inf_mask =  _mm256_cmp_ps(_x, p8f_inf, _CMP_EQ_OQ);
    143   Packet8f not_normal_finite_mask = _mm256_or_ps(lt_min_mask, inf_mask);
    144 
    145   // Compute an approximate result using the rsqrt intrinsic.
    146   Packet8f y_approx = _mm256_rsqrt_ps(_x);
    147 
    148   // Do a single step of Newton-Raphson iteration to improve the approximation.
    149   // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
    150   // It is essential to evaluate the inner term like this because forming
    151   // y_n^2 may over- or underflow.
    152   Packet8f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p8f_one_point_five));
    153 
    154   // Select the result of the Newton-Raphson step for positive normal arguments.
    155   // For other arguments, choose the output of the intrinsic. This will
    156   // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if
    157   // x is zero or a positive denormalized float (equivalent to flushing positive
    158   // denormalized inputs to zero).
    159   return pselect<Packet8f>(not_normal_finite_mask, y_approx, y_newton);
    160 }
    161 
    162 #else
    163 template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
    164 Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
    165   _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
    166   return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(_x));
    167 }
    168 #endif
    169 
    170 template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
    171 Packet4d prsqrt<Packet4d>(const Packet4d& _x) {
    172   _EIGEN_DECLARE_CONST_Packet4d(one, 1.0);
    173   return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(_x));
    174 }
    175 
    176 F16_PACKET_FUNCTION(Packet8f, Packet8h, psin)
    177 F16_PACKET_FUNCTION(Packet8f, Packet8h, pcos)
    178 F16_PACKET_FUNCTION(Packet8f, Packet8h, plog)
    179 F16_PACKET_FUNCTION(Packet8f, Packet8h, plog2)
    180 F16_PACKET_FUNCTION(Packet8f, Packet8h, plog1p)
    181 F16_PACKET_FUNCTION(Packet8f, Packet8h, pexpm1)
    182 F16_PACKET_FUNCTION(Packet8f, Packet8h, pexp)
    183 F16_PACKET_FUNCTION(Packet8f, Packet8h, ptanh)
    184 F16_PACKET_FUNCTION(Packet8f, Packet8h, psqrt)
    185 F16_PACKET_FUNCTION(Packet8f, Packet8h, prsqrt)
    186 
    187 template <>
    188 EIGEN_STRONG_INLINE Packet8h pfrexp(const Packet8h& a, Packet8h& exponent) {
    189   Packet8f fexponent;
    190   const Packet8h out = float2half(pfrexp<Packet8f>(half2float(a), fexponent));
    191   exponent = float2half(fexponent);
    192   return out;
    193 }
    194 
    195 template <>
    196 EIGEN_STRONG_INLINE Packet8h pldexp(const Packet8h& a, const Packet8h& exponent) {
    197   return float2half(pldexp<Packet8f>(half2float(a), half2float(exponent)));
    198 }
    199 
    200 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psin)
    201 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pcos)
    202 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog)
    203 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog2)
    204 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog1p)
    205 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexpm1)
    206 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexp)
    207 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, ptanh)
    208 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psqrt)
    209 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, prsqrt)
    210 
    211 template <>
    212 EIGEN_STRONG_INLINE Packet8bf pfrexp(const Packet8bf& a, Packet8bf& exponent) {
    213   Packet8f fexponent;
    214   const Packet8bf out = F32ToBf16(pfrexp<Packet8f>(Bf16ToF32(a), fexponent));
    215   exponent = F32ToBf16(fexponent);
    216   return out;
    217 }
    218 
    219 template <>
    220 EIGEN_STRONG_INLINE Packet8bf pldexp(const Packet8bf& a, const Packet8bf& exponent) {
    221   return F32ToBf16(pldexp<Packet8f>(Bf16ToF32(a), Bf16ToF32(exponent)));
    222 }
    223 
    224 }  // end namespace internal
    225 
    226 }  // end namespace Eigen
    227 
    228 #endif  // EIGEN_MATH_FUNCTIONS_AVX_H