MathFunctions.h (13344B)
1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2016 Pedro Gonnet (pedro.gonnet@gmail.com) 5 // 6 // This Source Code Form is subject to the terms of the Mozilla 7 // Public License v. 2.0. If a copy of the MPL was not distributed 8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 10 #ifndef THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_ 11 #define THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_ 12 13 namespace Eigen { 14 15 namespace internal { 16 17 // Disable the code for older versions of gcc that don't support many of the required avx512 instrinsics. 18 #if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 19 20 #define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \ 21 const Packet16f p16f_##NAME = pset1<Packet16f>(X) 22 23 #define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \ 24 const Packet16f p16f_##NAME = preinterpret<Packet16f,Packet16i>(pset1<Packet16i>(X)) 25 26 #define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \ 27 const Packet8d p8d_##NAME = pset1<Packet8d>(X) 28 29 #define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \ 30 const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X)) 31 32 #define _EIGEN_DECLARE_CONST_Packet16bf(NAME, X) \ 33 const Packet16bf p16bf_##NAME = pset1<Packet16bf>(X) 34 35 #define _EIGEN_DECLARE_CONST_Packet16bf_FROM_INT(NAME, X) \ 36 const Packet16bf p16bf_##NAME = preinterpret<Packet16bf,Packet16i>(pset1<Packet16i>(X)) 37 38 template <> 39 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f 40 plog<Packet16f>(const Packet16f& _x) { 41 return plog_float(_x); 42 } 43 44 template <> 45 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d 46 plog<Packet8d>(const Packet8d& _x) { 47 return plog_double(_x); 48 } 49 50 F16_PACKET_FUNCTION(Packet16f, Packet16h, plog) 51 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog) 52 53 template <> 54 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f 55 plog2<Packet16f>(const Packet16f& _x) { 56 return plog2_float(_x); 57 } 58 59 template <> 60 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d 61 plog2<Packet8d>(const Packet8d& _x) { 62 return plog2_double(_x); 63 } 64 65 F16_PACKET_FUNCTION(Packet16f, Packet16h, plog2) 66 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog2) 67 68 // Exponential function. Works by writing "x = m*log(2) + r" where 69 // "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then 70 // "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1). 71 template <> 72 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f 73 pexp<Packet16f>(const Packet16f& _x) { 74 _EIGEN_DECLARE_CONST_Packet16f(1, 1.0f); 75 _EIGEN_DECLARE_CONST_Packet16f(half, 0.5f); 76 _EIGEN_DECLARE_CONST_Packet16f(127, 127.0f); 77 78 _EIGEN_DECLARE_CONST_Packet16f(exp_hi, 88.3762626647950f); 79 _EIGEN_DECLARE_CONST_Packet16f(exp_lo, -88.3762626647949f); 80 81 _EIGEN_DECLARE_CONST_Packet16f(cephes_LOG2EF, 1.44269504088896341f); 82 83 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p0, 1.9875691500E-4f); 84 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p1, 1.3981999507E-3f); 85 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p2, 8.3334519073E-3f); 86 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p3, 4.1665795894E-2f); 87 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p4, 1.6666665459E-1f); 88 _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p5, 5.0000001201E-1f); 89 90 // Clamp x. 91 Packet16f x = pmax(pmin(_x, p16f_exp_hi), p16f_exp_lo); 92 93 // Express exp(x) as exp(m*ln(2) + r), start by extracting 94 // m = floor(x/ln(2) + 0.5). 95 Packet16f m = _mm512_floor_ps(pmadd(x, p16f_cephes_LOG2EF, p16f_half)); 96 97 // Get r = x - m*ln(2). Note that we can do this without losing more than one 98 // ulp precision due to the FMA instruction. 99 _EIGEN_DECLARE_CONST_Packet16f(nln2, -0.6931471805599453f); 100 Packet16f r = _mm512_fmadd_ps(m, p16f_nln2, x); 101 Packet16f r2 = pmul(r, r); 102 Packet16f r3 = pmul(r2, r); 103 104 // Evaluate the polynomial approximant,improved by instruction-level parallelism. 105 Packet16f y, y1, y2; 106 y = pmadd(p16f_cephes_exp_p0, r, p16f_cephes_exp_p1); 107 y1 = pmadd(p16f_cephes_exp_p3, r, p16f_cephes_exp_p4); 108 y2 = padd(r, p16f_1); 109 y = pmadd(y, r, p16f_cephes_exp_p2); 110 y1 = pmadd(y1, r, p16f_cephes_exp_p5); 111 y = pmadd(y, r3, y1); 112 y = pmadd(y, r2, y2); 113 114 // Build emm0 = 2^m. 115 Packet16i emm0 = _mm512_cvttps_epi32(padd(m, p16f_127)); 116 emm0 = _mm512_slli_epi32(emm0, 23); 117 118 // Return 2^m * exp(r). 119 return pmax(pmul(y, _mm512_castsi512_ps(emm0)), _x); 120 } 121 122 template <> 123 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d 124 pexp<Packet8d>(const Packet8d& _x) { 125 return pexp_double(_x); 126 } 127 128 F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp) 129 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexp) 130 131 template <> 132 EIGEN_STRONG_INLINE Packet16h pfrexp(const Packet16h& a, Packet16h& exponent) { 133 Packet16f fexponent; 134 const Packet16h out = float2half(pfrexp<Packet16f>(half2float(a), fexponent)); 135 exponent = float2half(fexponent); 136 return out; 137 } 138 139 template <> 140 EIGEN_STRONG_INLINE Packet16h pldexp(const Packet16h& a, const Packet16h& exponent) { 141 return float2half(pldexp<Packet16f>(half2float(a), half2float(exponent))); 142 } 143 144 template <> 145 EIGEN_STRONG_INLINE Packet16bf pfrexp(const Packet16bf& a, Packet16bf& exponent) { 146 Packet16f fexponent; 147 const Packet16bf out = F32ToBf16(pfrexp<Packet16f>(Bf16ToF32(a), fexponent)); 148 exponent = F32ToBf16(fexponent); 149 return out; 150 } 151 152 template <> 153 EIGEN_STRONG_INLINE Packet16bf pldexp(const Packet16bf& a, const Packet16bf& exponent) { 154 return F32ToBf16(pldexp<Packet16f>(Bf16ToF32(a), Bf16ToF32(exponent))); 155 } 156 157 // Functions for sqrt. 158 // The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step 159 // of Newton's method, at a cost of 1-2 bits of precision as opposed to the 160 // exact solution. The main advantage of this approach is not just speed, but 161 // also the fact that it can be inlined and pipelined with other computations, 162 // further reducing its effective latency. 163 #if EIGEN_FAST_MATH 164 template <> 165 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f 166 psqrt<Packet16f>(const Packet16f& _x) { 167 Packet16f neg_half = pmul(_x, pset1<Packet16f>(-.5f)); 168 __mmask16 denormal_mask = _mm512_kand( 169 _mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()), 170 _CMP_LT_OQ), 171 _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ)); 172 173 Packet16f x = _mm512_rsqrt14_ps(_x); 174 175 // Do a single step of Newton's iteration. 176 x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet16f>(1.5f))); 177 178 // Flush results for denormals to zero. 179 return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps()); 180 } 181 182 template <> 183 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d 184 psqrt<Packet8d>(const Packet8d& _x) { 185 Packet8d neg_half = pmul(_x, pset1<Packet8d>(-.5)); 186 __mmask16 denormal_mask = _mm512_kand( 187 _mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()), 188 _CMP_LT_OQ), 189 _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ)); 190 191 Packet8d x = _mm512_rsqrt14_pd(_x); 192 193 // Do a single step of Newton's iteration. 194 x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5))); 195 196 // Do a second step of Newton's iteration. 197 x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5))); 198 199 return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd()); 200 } 201 #else 202 template <> 203 EIGEN_STRONG_INLINE Packet16f psqrt<Packet16f>(const Packet16f& x) { 204 return _mm512_sqrt_ps(x); 205 } 206 207 template <> 208 EIGEN_STRONG_INLINE Packet8d psqrt<Packet8d>(const Packet8d& x) { 209 return _mm512_sqrt_pd(x); 210 } 211 #endif 212 213 F16_PACKET_FUNCTION(Packet16f, Packet16h, psqrt) 214 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psqrt) 215 216 // prsqrt for float. 217 #if defined(EIGEN_VECTORIZE_AVX512ER) 218 219 template <> 220 EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) { 221 return _mm512_rsqrt28_ps(x); 222 } 223 #elif EIGEN_FAST_MATH 224 225 template <> 226 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f 227 prsqrt<Packet16f>(const Packet16f& _x) { 228 _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000); 229 _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f); 230 _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f); 231 232 Packet16f neg_half = pmul(_x, p16f_minus_half); 233 234 // Identity infinite, negative and denormal arguments. 235 __mmask16 inf_mask = _mm512_cmp_ps_mask(_x, p16f_inf, _CMP_EQ_OQ); 236 __mmask16 not_pos_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LE_OQ); 237 __mmask16 not_finite_pos_mask = not_pos_mask | inf_mask; 238 239 // Compute an approximate result using the rsqrt intrinsic, forcing +inf 240 // for denormals for consistency with AVX and SSE implementations. 241 Packet16f y_approx = _mm512_rsqrt14_ps(_x); 242 243 // Do a single step of Newton-Raphson iteration to improve the approximation. 244 // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n). 245 // It is essential to evaluate the inner term like this because forming 246 // y_n^2 may over- or underflow. 247 Packet16f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p16f_one_point_five)); 248 249 // Select the result of the Newton-Raphson step for positive finite arguments. 250 // For other arguments, choose the output of the intrinsic. This will 251 // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf. 252 return _mm512_mask_blend_ps(not_finite_pos_mask, y_newton, y_approx); 253 } 254 #else 255 256 template <> 257 EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) { 258 _EIGEN_DECLARE_CONST_Packet16f(one, 1.0f); 259 return _mm512_div_ps(p16f_one, _mm512_sqrt_ps(x)); 260 } 261 #endif 262 263 F16_PACKET_FUNCTION(Packet16f, Packet16h, prsqrt) 264 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, prsqrt) 265 266 // prsqrt for double. 267 #if EIGEN_FAST_MATH 268 template <> 269 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d 270 prsqrt<Packet8d>(const Packet8d& _x) { 271 _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5); 272 _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5); 273 _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL); 274 275 Packet8d neg_half = pmul(_x, p8d_minus_half); 276 277 // Identity infinite, negative and denormal arguments. 278 __mmask8 inf_mask = _mm512_cmp_pd_mask(_x, p8d_inf, _CMP_EQ_OQ); 279 __mmask8 not_pos_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LE_OQ); 280 __mmask8 not_finite_pos_mask = not_pos_mask | inf_mask; 281 282 // Compute an approximate result using the rsqrt intrinsic, forcing +inf 283 // for denormals for consistency with AVX and SSE implementations. 284 #if defined(EIGEN_VECTORIZE_AVX512ER) 285 Packet8d y_approx = _mm512_rsqrt28_pd(_x); 286 #else 287 Packet8d y_approx = _mm512_rsqrt14_pd(_x); 288 #endif 289 // Do one or two steps of Newton-Raphson's to improve the approximation, depending on the 290 // starting accuracy (either 2^-14 or 2^-28, depending on whether AVX512ER is available). 291 // The Newton-Raphson algorithm has quadratic convergence and roughly doubles the number 292 // of correct digits for each step. 293 // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n). 294 // It is essential to evaluate the inner term like this because forming 295 // y_n^2 may over- or underflow. 296 Packet8d y_newton = pmul(y_approx, pmadd(neg_half, pmul(y_approx, y_approx), p8d_one_point_five)); 297 #if !defined(EIGEN_VECTORIZE_AVX512ER) 298 y_newton = pmul(y_newton, pmadd(y_newton, pmul(neg_half, y_newton), p8d_one_point_five)); 299 #endif 300 // Select the result of the Newton-Raphson step for positive finite arguments. 301 // For other arguments, choose the output of the intrinsic. This will 302 // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf. 303 return _mm512_mask_blend_pd(not_finite_pos_mask, y_newton, y_approx); 304 } 305 #else 306 template <> 307 EIGEN_STRONG_INLINE Packet8d prsqrt<Packet8d>(const Packet8d& x) { 308 _EIGEN_DECLARE_CONST_Packet8d(one, 1.0f); 309 return _mm512_div_pd(p8d_one, _mm512_sqrt_pd(x)); 310 } 311 #endif 312 313 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED 314 Packet16f plog1p<Packet16f>(const Packet16f& _x) { 315 return generic_plog1p(_x); 316 } 317 318 F16_PACKET_FUNCTION(Packet16f, Packet16h, plog1p) 319 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog1p) 320 321 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED 322 Packet16f pexpm1<Packet16f>(const Packet16f& _x) { 323 return generic_expm1(_x); 324 } 325 326 F16_PACKET_FUNCTION(Packet16f, Packet16h, pexpm1) 327 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexpm1) 328 329 #endif 330 331 332 template <> 333 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f 334 psin<Packet16f>(const Packet16f& _x) { 335 return psin_float(_x); 336 } 337 338 template <> 339 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f 340 pcos<Packet16f>(const Packet16f& _x) { 341 return pcos_float(_x); 342 } 343 344 template <> 345 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f 346 ptanh<Packet16f>(const Packet16f& _x) { 347 return internal::generic_fast_tanh_float(_x); 348 } 349 350 F16_PACKET_FUNCTION(Packet16f, Packet16h, psin) 351 F16_PACKET_FUNCTION(Packet16f, Packet16h, pcos) 352 F16_PACKET_FUNCTION(Packet16f, Packet16h, ptanh) 353 354 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psin) 355 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pcos) 356 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, ptanh) 357 358 } // end namespace internal 359 360 } // end namespace Eigen 361 362 #endif // THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_