PacketMath.h - cart-elc - Source code for CART-ELC

PacketMath.h (57047B)
      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 #ifndef EIGEN_PACKET_MATH_GPU_H
     11 #define EIGEN_PACKET_MATH_GPU_H
     12 
     13 namespace Eigen {
     14 
     15 namespace internal {
     16 
     17 // Read-only data cached load available.
     18 #if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350)
     19 #define EIGEN_GPU_HAS_LDG 1
     20 #endif
     21 
     22 // FP16 math available.
     23 #if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530)
     24 #define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1
     25 #endif
     26 
     27 #if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
     28 #define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
     29 #endif
     30 
     31 // Make sure this is only available when targeting a GPU: we don't want to
     32 // introduce conflicts between these packet_traits definitions and the ones
     33 // we'll use on the host side (SSE, AVX, ...)
     34 #if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
     35 
     36 template<> struct is_arithmetic<float4>  { enum { value = true }; };
     37 template<> struct is_arithmetic<double2> { enum { value = true }; };
     38 
     39 template<> struct packet_traits<float> : default_packet_traits
     40 {
     41   typedef float4 type;
     42   typedef float4 half;
     43   enum {
     44     Vectorizable = 1,
     45     AlignedOnScalar = 1,
     46     size=4,
     47     HasHalfPacket = 0,
     48 
     49     HasDiv  = 1,
     50     HasSin  = 0,
     51     HasCos  = 0,
     52     HasLog  = 1,
     53     HasExp  = 1,
     54     HasSqrt = 1,
     55     HasRsqrt = 1,
     56     HasLGamma = 1,
     57     HasDiGamma = 1,
     58     HasZeta = 1,
     59     HasPolygamma = 1,
     60     HasErf = 1,
     61     HasErfc = 1,
     62     HasNdtri = 1,
     63     HasBessel = 1,
     64     HasIGamma = 1,
     65     HasIGammaDerA = 1,
     66     HasGammaSampleDerAlpha = 1,
     67     HasIGammac = 1,
     68     HasBetaInc = 1,
     69 
     70     HasBlend = 0,
     71     HasFloor = 1,
     72   };
     73 };
     74 
     75 template<> struct packet_traits<double> : default_packet_traits
     76 {
     77   typedef double2 type;
     78   typedef double2 half;
     79   enum {
     80     Vectorizable = 1,
     81     AlignedOnScalar = 1,
     82     size=2,
     83     HasHalfPacket = 0,
     84 
     85     HasDiv  = 1,
     86     HasLog  = 1,
     87     HasExp  = 1,
     88     HasSqrt = 1,
     89     HasRsqrt = 1,
     90     HasLGamma = 1,
     91     HasDiGamma = 1,
     92     HasZeta = 1,
     93     HasPolygamma = 1,
     94     HasErf = 1,
     95     HasErfc = 1,
     96     HasNdtri = 1,
     97     HasBessel = 1,
     98     HasIGamma = 1,
     99     HasIGammaDerA = 1,
    100     HasGammaSampleDerAlpha = 1,
    101     HasIGammac = 1,
    102     HasBetaInc = 1,
    103 
    104     HasBlend = 0,
    105     HasFloor = 1,
    106   };
    107 };
    108 
    109 
    110 template<> struct unpacket_traits<float4>  { typedef float  type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef float4 half; };
    111 template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef double2 half; };
    112 
    113 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float&  from) {
    114   return make_float4(from, from, from, from);
    115 }
    116 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
    117   return make_double2(from, from);
    118 }
    119 
    120 // We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
    121 // invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
    122 // of the functions, while the latter can only deal with one of them.
    123 #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
    124 namespace {
    125 
    126 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a,
    127                                                         const float& b) {
    128   return __int_as_float(__float_as_int(a) & __float_as_int(b));
    129 }
    130 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a,
    131                                                          const double& b) {
    132   return __longlong_as_double(__double_as_longlong(a) &
    133                               __double_as_longlong(b));
    134 }
    135 
    136 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a,
    137                                                        const float& b) {
    138   return __int_as_float(__float_as_int(a) | __float_as_int(b));
    139 }
    140 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a,
    141                                                         const double& b) {
    142   return __longlong_as_double(__double_as_longlong(a) |
    143                               __double_as_longlong(b));
    144 }
    145 
    146 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a,
    147                                                         const float& b) {
    148   return __int_as_float(__float_as_int(a) ^ __float_as_int(b));
    149 }
    150 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a,
    151                                                          const double& b) {
    152   return __longlong_as_double(__double_as_longlong(a) ^
    153                               __double_as_longlong(b));
    154 }
    155 
    156 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a,
    157                                                            const float& b) {
    158   return __int_as_float(__float_as_int(a) & ~__float_as_int(b));
    159 }
    160 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a,
    161                                                             const double& b) {
    162   return __longlong_as_double(__double_as_longlong(a) &
    163                               ~__double_as_longlong(b));
    164 }
    165 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a,
    166                                                     const float& b) {
    167   return __int_as_float(a == b ? 0xffffffffu : 0u);
    168 }
    169 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a,
    170                                                      const double& b) {
    171   return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull);
    172 }
    173 
    174 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float lt_mask(const float& a,
    175                                                     const float& b) {
    176   return __int_as_float(a < b ? 0xffffffffu : 0u);
    177 }
    178 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a,
    179                                                      const double& b) {
    180   return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull);
    181 }
    182 
    183 }  // namespace
    184 
    185 template <>
    186 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a,
    187                                                           const float4& b) {
    188   return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y),
    189                      bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
    190 }
    191 template <>
    192 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a,
    193                                                             const double2& b) {
    194   return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y));
    195 }
    196 
    197 template <>
    198 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a,
    199                                                          const float4& b) {
    200   return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y),
    201                      bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
    202 }
    203 template <>
    204 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a,
    205                                                            const double2& b) {
    206   return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y));
    207 }
    208 
    209 template <>
    210 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a,
    211                                                           const float4& b) {
    212   return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y),
    213                      bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
    214 }
    215 template <>
    216 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a,
    217                                                             const double2& b) {
    218   return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y));
    219 }
    220 
    221 template <>
    222 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a,
    223                                                              const float4& b) {
    224   return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y),
    225                      bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w));
    226 }
    227 template <>
    228 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
    229 pandnot<double2>(const double2& a, const double2& b) {
    230   return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y));
    231 }
    232 
    233 template <>
    234 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a,
    235                                                              const float4& b) {
    236   return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z),
    237                      eq_mask(a.w, b.w));
    238 }
    239 template <>
    240 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt<float4>(const float4& a,
    241                                                              const float4& b) {
    242   return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z),
    243                      lt_mask(a.w, b.w));
    244 }
    245 template <>
    246 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
    247 pcmp_eq<double2>(const double2& a, const double2& b) {
    248   return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
    249 }
    250 template <>
    251 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
    252 pcmp_lt<double2>(const double2& a, const double2& b) {
    253   return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y));
    254 }
    255 #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
    256 
    257 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
    258   return make_float4(a, a+1, a+2, a+3);
    259 }
    260 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
    261   return make_double2(a, a+1);
    262 }
    263 
    264 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
    265   return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
    266 }
    267 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
    268   return make_double2(a.x+b.x, a.y+b.y);
    269 }
    270 
    271 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
    272   return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
    273 }
    274 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
    275   return make_double2(a.x-b.x, a.y-b.y);
    276 }
    277 
    278 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
    279   return make_float4(-a.x, -a.y, -a.z, -a.w);
    280 }
    281 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
    282   return make_double2(-a.x, -a.y);
    283 }
    284 
    285 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
    286 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
    287 
    288 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
    289   return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
    290 }
    291 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
    292   return make_double2(a.x*b.x, a.y*b.y);
    293 }
    294 
    295 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
    296   return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
    297 }
    298 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
    299   return make_double2(a.x/b.x, a.y/b.y);
    300 }
    301 
    302 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
    303   return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
    304 }
    305 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
    306   return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
    307 }
    308 
    309 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
    310   return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
    311 }
    312 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
    313   return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
    314 }
    315 
    316 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
    317   return *reinterpret_cast<const float4*>(from);
    318 }
    319 
    320 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
    321   return *reinterpret_cast<const double2*>(from);
    322 }
    323 
    324 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
    325   return make_float4(from[0], from[1], from[2], from[3]);
    326 }
    327 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
    328   return make_double2(from[0], from[1]);
    329 }
    330 
    331 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float*   from) {
    332   return make_float4(from[0], from[0], from[1], from[1]);
    333 }
    334 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double*  from) {
    335   return make_double2(from[0], from[0]);
    336 }
    337 
    338 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float*   to, const float4& from) {
    339   *reinterpret_cast<float4*>(to) = from;
    340 }
    341 
    342 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
    343   *reinterpret_cast<double2*>(to) = from;
    344 }
    345 
    346 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const float4& from) {
    347   to[0] = from.x;
    348   to[1] = from.y;
    349   to[2] = from.z;
    350   to[3] = from.w;
    351 }
    352 
    353 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
    354   to[0] = from.x;
    355   to[1] = from.y;
    356 }
    357 
    358 template<>
    359 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
    360 #if defined(EIGEN_GPU_HAS_LDG)
    361   return __ldg((const float4*)from);
    362 #else
    363   return make_float4(from[0], from[1], from[2], from[3]);
    364 #endif
    365 }
    366 template<>
    367 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
    368 #if defined(EIGEN_GPU_HAS_LDG)
    369   return __ldg((const double2*)from);
    370 #else
    371   return make_double2(from[0], from[1]);
    372 #endif
    373 }
    374 
    375 template<>
    376 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
    377 #if defined(EIGEN_GPU_HAS_LDG)
    378   return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
    379 #else
    380   return make_float4(from[0], from[1], from[2], from[3]);
    381 #endif
    382 }
    383 template<>
    384 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
    385 #if defined(EIGEN_GPU_HAS_LDG)
    386   return make_double2(__ldg(from+0), __ldg(from+1));
    387 #else
    388   return make_double2(from[0], from[1]);
    389 #endif
    390 }
    391 
    392 template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
    393   return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
    394 }
    395 
    396 template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
    397   return make_double2(from[0*stride], from[1*stride]);
    398 }
    399 
    400 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
    401   to[stride*0] = from.x;
    402   to[stride*1] = from.y;
    403   to[stride*2] = from.z;
    404   to[stride*3] = from.w;
    405 }
    406 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
    407   to[stride*0] = from.x;
    408   to[stride*1] = from.y;
    409 }
    410 
    411 template<> EIGEN_DEVICE_FUNC inline float  pfirst<float4>(const float4& a) {
    412   return a.x;
    413 }
    414 template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
    415   return a.x;
    416 }
    417 
    418 template<> EIGEN_DEVICE_FUNC inline float  predux<float4>(const float4& a) {
    419   return a.x + a.y + a.z + a.w;
    420 }
    421 template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
    422   return a.x + a.y;
    423 }
    424 
    425 template<> EIGEN_DEVICE_FUNC inline float  predux_max<float4>(const float4& a) {
    426   return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
    427 }
    428 template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
    429   return fmax(a.x, a.y);
    430 }
    431 
    432 template<> EIGEN_DEVICE_FUNC inline float  predux_min<float4>(const float4& a) {
    433   return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
    434 }
    435 template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
    436   return fmin(a.x, a.y);
    437 }
    438 
    439 template<> EIGEN_DEVICE_FUNC inline float  predux_mul<float4>(const float4& a) {
    440   return a.x * a.y * a.z * a.w;
    441 }
    442 template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
    443   return a.x * a.y;
    444 }
    445 
    446 template<> EIGEN_DEVICE_FUNC inline float4  pabs<float4>(const float4& a) {
    447   return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
    448 }
    449 template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
    450   return make_double2(fabs(a.x), fabs(a.y));
    451 }
    452 
    453 template<> EIGEN_DEVICE_FUNC inline float4  pfloor<float4>(const float4& a) {
    454   return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
    455 }
    456 template<> EIGEN_DEVICE_FUNC inline double2 pfloor<double2>(const double2& a) {
    457   return make_double2(floor(a.x), floor(a.y));
    458 }
    459 
    460 EIGEN_DEVICE_FUNC inline void
    461 ptranspose(PacketBlock<float4,4>& kernel) {
    462   float tmp = kernel.packet[0].y;
    463   kernel.packet[0].y = kernel.packet[1].x;
    464   kernel.packet[1].x = tmp;
    465 
    466   tmp = kernel.packet[0].z;
    467   kernel.packet[0].z = kernel.packet[2].x;
    468   kernel.packet[2].x = tmp;
    469 
    470   tmp = kernel.packet[0].w;
    471   kernel.packet[0].w = kernel.packet[3].x;
    472   kernel.packet[3].x = tmp;
    473 
    474   tmp = kernel.packet[1].z;
    475   kernel.packet[1].z = kernel.packet[2].y;
    476   kernel.packet[2].y = tmp;
    477 
    478   tmp = kernel.packet[1].w;
    479   kernel.packet[1].w = kernel.packet[3].y;
    480   kernel.packet[3].y = tmp;
    481 
    482   tmp = kernel.packet[2].w;
    483   kernel.packet[2].w = kernel.packet[3].z;
    484   kernel.packet[3].z = tmp;
    485 }
    486 
    487 EIGEN_DEVICE_FUNC inline void
    488 ptranspose(PacketBlock<double2,2>& kernel) {
    489   double tmp = kernel.packet[0].y;
    490   kernel.packet[0].y = kernel.packet[1].x;
    491   kernel.packet[1].x = tmp;
    492 }
    493 
    494 #endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
    495 
    496 // Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning
    497 // its corresponding packet_traits<Eigen::half> must be visible on host.
    498 #if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
    499 
    500 typedef ulonglong2 Packet4h2;
    501 template<> struct unpacket_traits<Packet4h2> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; };
    502 template<> struct is_arithmetic<Packet4h2> { enum { value = true }; };
    503 
    504 template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef half2 half; };
    505 template<> struct is_arithmetic<half2> { enum { value = true }; };
    506 
    507 template<> struct packet_traits<Eigen::half> : default_packet_traits
    508 {
    509   typedef Packet4h2 type;
    510   typedef Packet4h2 half;
    511   enum {
    512     Vectorizable = 1,
    513     AlignedOnScalar = 1,
    514     size=8,
    515     HasHalfPacket = 0,
    516     HasAdd    = 1,
    517     HasSub    = 1,
    518     HasMul    = 1,
    519     HasDiv    = 1,
    520     HasSqrt   = 1,
    521     HasRsqrt  = 1,
    522     HasExp    = 1,
    523     HasExpm1  = 1,
    524     HasLog    = 1,
    525     HasLog1p  = 1
    526   };
    527 };
    528 
    529 namespace {
    530 // This is equivalent to make_half2, which is undocumented and doesn't seem to always exist.
    531 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 combine_half(const __half& a, const __half& b) {
    532 #if defined(EIGEN_GPU_COMPILE_PHASE)
    533   return __halves2half2(a, b);
    534 #else
    535   // Round-about way since __halves2half2 is a __device__ function.
    536   return __floats2half2_rn(__half2float(a), __half2float(b));
    537 #endif
    538 }
    539 
    540 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_low(const half2& a) {
    541 #if defined(EIGEN_GPU_COMPILE_PHASE)
    542   return __low2half(a);
    543 #else
    544   return __float2half(__low2float(a));
    545 #endif
    546 }
    547 
    548 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_high(const half2& a) {
    549 #if defined(EIGEN_GPU_COMPILE_PHASE)
    550   return __high2half(a);
    551 #else
    552   return __float2half(__high2float(a));
    553 #endif
    554 }
    555 } // namespace
    556 
    557 template<>
    558 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
    559 #if defined(EIGEN_GPU_COMPILE_PHASE)
    560   return __half2half2(from);
    561 #else
    562   const float f = __half2float(from);
    563   return __floats2half2_rn(f, f);
    564 #endif
    565 }
    566 
    567 template <>
    568 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
    569 pset1<Packet4h2>(const Eigen::half& from) {
    570   Packet4h2 r;
    571   half2* p_alias = reinterpret_cast<half2*>(&r);
    572   p_alias[0] = pset1<half2>(from);
    573   p_alias[1] = pset1<half2>(from);
    574   p_alias[2] = pset1<half2>(from);
    575   p_alias[3] = pset1<half2>(from);
    576   return r;
    577 }
    578 
    579 // We now need this visible on both host and device.
    580 // #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
    581 namespace {
    582 
    583 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
    584   return *reinterpret_cast<const half2*>(from);
    585 }
    586 
    587 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) {
    588   return combine_half(from[0], from[1]);
    589 }
    590 
    591 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half*  from) {
    592   return combine_half(from[0], from[0]);
    593 }
    594 
    595 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to,
    596                                                   const half2& from) {
    597   *reinterpret_cast<half2*>(to) = from;
    598 }
    599 
    600 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to,
    601                                                    const half2& from) {
    602   to[0] = get_half2_low(from);
    603   to[1] = get_half2_high(from);
    604 }
    605 
    606 
    607 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(
    608     const Eigen::half* from) {
    609 #if defined(EIGEN_GPU_HAS_LDG)
    610   // Input is guaranteed to be properly aligned.
    611   return __ldg(reinterpret_cast<const half2*>(from));
    612 #else
    613   return combine_half(*(from+0), *(from+1));
    614 #endif
    615 }
    616 
    617 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(
    618     const Eigen::half* from) {
    619 #if defined(EIGEN_GPU_HAS_LDG)
    620   return __halves2half2(__ldg(from+0), __ldg(from+1));
    621 #else
    622   return combine_half(*(from+0), *(from+1));
    623 #endif
    624 }
    625 
    626 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from,
    627                                                     Index stride) {
    628   return combine_half(from[0*stride], from[1*stride]);
    629 }
    630 
    631 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(
    632     Eigen::half* to, const half2& from, Index stride) {
    633   to[stride*0] = get_half2_low(from);
    634   to[stride*1] = get_half2_high(from);
    635 }
    636 
    637 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) {
    638   return get_half2_low(a);
    639 }
    640 
    641 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) {
    642   half a1 = get_half2_low(a);
    643   half a2 = get_half2_high(a);
    644   half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF);
    645   half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF);
    646   return combine_half(result1, result2);
    647 }
    648 
    649 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) {
    650   half true_half = half_impl::raw_uint16_to_half(0xffffu);
    651   return pset1<half2>(true_half);
    652 }
    653 
    654 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) {
    655   half false_half = half_impl::raw_uint16_to_half(0x0000u);
    656   return pset1<half2>(false_half);
    657 }
    658 
    659 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
    660 ptranspose(PacketBlock<half2,2>& kernel) {
    661   __half a1 = get_half2_low(kernel.packet[0]);
    662   __half a2 = get_half2_high(kernel.packet[0]);
    663   __half b1 = get_half2_low(kernel.packet[1]);
    664   __half b2 = get_half2_high(kernel.packet[1]);
    665   kernel.packet[0] = combine_half(a1, b1);
    666   kernel.packet[1] = combine_half(a2, b2);
    667 }
    668 
    669 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
    670 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
    671   return __halves2half2(a, __hadd(a, __float2half(1.0f)));
    672 #else
    673   float f = __half2float(a) + 1.0f;
    674   return combine_half(a, __float2half(f));
    675 #endif
    676 }
    677 
    678 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask,
    679                                                     const half2& a,
    680                                                     const half2& b) {
    681   half mask_low = get_half2_low(mask);
    682   half mask_high = get_half2_high(mask);
    683   half result_low = mask_low == half(0) ? get_half2_low(b) : get_half2_low(a);
    684   half result_high = mask_high == half(0) ? get_half2_high(b) : get_half2_high(a);
    685   return combine_half(result_low, result_high);
    686 }
    687 
    688 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a,
    689                                                     const half2& b) {
    690   half true_half = half_impl::raw_uint16_to_half(0xffffu);
    691   half false_half = half_impl::raw_uint16_to_half(0x0000u);
    692   half a1 = get_half2_low(a);
    693   half a2 = get_half2_high(a);
    694   half b1 = get_half2_low(b);
    695   half b2 = get_half2_high(b);
    696   half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
    697   half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
    698   return combine_half(eq1, eq2);
    699 }
    700 
    701 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a,
    702                                                     const half2& b) {
    703   half true_half = half_impl::raw_uint16_to_half(0xffffu);
    704   half false_half = half_impl::raw_uint16_to_half(0x0000u);
    705   half a1 = get_half2_low(a);
    706   half a2 = get_half2_high(a);
    707   half b1 = get_half2_low(b);
    708   half b2 = get_half2_high(b);
    709   half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half;
    710   half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half;
    711   return combine_half(eq1, eq2);
    712 }
    713 
    714 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a,
    715                                                  const half2& b) {
    716   half a1 = get_half2_low(a);
    717   half a2 = get_half2_high(a);
    718   half b1 = get_half2_low(b);
    719   half b2 = get_half2_high(b);
    720   half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x);
    721   half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x);
    722   return combine_half(result1, result2);
    723 }
    724 
    725 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a,
    726                                                 const half2& b) {
    727   half a1 = get_half2_low(a);
    728   half a2 = get_half2_high(a);
    729   half b1 = get_half2_low(b);
    730   half b2 = get_half2_high(b);
    731   half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x);
    732   half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x);
    733   return combine_half(result1, result2);
    734 }
    735 
    736 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a,
    737                                                  const half2& b) {
    738   half a1 = get_half2_low(a);
    739   half a2 = get_half2_high(a);
    740   half b1 = get_half2_low(b);
    741   half b2 = get_half2_high(b);
    742   half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x);
    743   half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x);
    744   return combine_half(result1, result2);
    745 }
    746 
    747 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a,
    748                                                     const half2& b) {
    749   half a1 = get_half2_low(a);
    750   half a2 = get_half2_high(a);
    751   half b1 = get_half2_low(b);
    752   half b2 = get_half2_high(b);
    753   half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x);
    754   half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x);
    755   return combine_half(result1, result2);
    756 }
    757 
    758 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a,
    759                                                  const half2& b) {
    760 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
    761   return __hadd2(a, b);
    762 #else
    763   float a1 = __low2float(a);
    764   float a2 = __high2float(a);
    765   float b1 = __low2float(b);
    766   float b2 = __high2float(b);
    767   float r1 = a1 + b1;
    768   float r2 = a2 + b2;
    769   return __floats2half2_rn(r1, r2);
    770 #endif
    771 }
    772 
    773 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a,
    774                                                  const half2& b) {
    775 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
    776   return __hsub2(a, b);
    777 #else
    778   float a1 = __low2float(a);
    779   float a2 = __high2float(a);
    780   float b1 = __low2float(b);
    781   float b2 = __high2float(b);
    782   float r1 = a1 - b1;
    783   float r2 = a2 - b2;
    784   return __floats2half2_rn(r1, r2);
    785 #endif
    786 }
    787 
    788 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
    789 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
    790   return __hneg2(a);
    791 #else
    792   float a1 = __low2float(a);
    793   float a2 = __high2float(a);
    794   return __floats2half2_rn(-a1, -a2);
    795 #endif
    796 }
    797 
    798 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
    799 
    800 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a,
    801                                                  const half2& b) {
    802 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
    803   return __hmul2(a, b);
    804 #else
    805   float a1 = __low2float(a);
    806   float a2 = __high2float(a);
    807   float b1 = __low2float(b);
    808   float b2 = __high2float(b);
    809   float r1 = a1 * b1;
    810   float r2 = a2 * b2;
    811   return __floats2half2_rn(r1, r2);
    812 #endif
    813 }
    814 
    815 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a,
    816                                                   const half2& b,
    817                                                   const half2& c) {
    818 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
    819    return __hfma2(a, b, c);
    820 #else
    821   float a1 = __low2float(a);
    822   float a2 = __high2float(a);
    823   float b1 = __low2float(b);
    824   float b2 = __high2float(b);
    825   float c1 = __low2float(c);
    826   float c2 = __high2float(c);
    827   float r1 = a1 * b1 + c1;
    828   float r2 = a2 * b2 + c2;
    829   return __floats2half2_rn(r1, r2);
    830 #endif
    831 }
    832 
    833 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a,
    834                                                  const half2& b) {
    835 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
    836   return __h2div(a, b);
    837 #else
    838   float a1 = __low2float(a);
    839   float a2 = __high2float(a);
    840   float b1 = __low2float(b);
    841   float b2 = __high2float(b);
    842   float r1 = a1 / b1;
    843   float r2 = a2 / b2;
    844   return __floats2half2_rn(r1, r2);
    845 #endif
    846 }
    847 
    848 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a,
    849                                                  const half2& b) {
    850   float a1 = __low2float(a);
    851   float a2 = __high2float(a);
    852   float b1 = __low2float(b);
    853   float b2 = __high2float(b);
    854   __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
    855   __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
    856   return combine_half(r1, r2);
    857 }
    858 
    859 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a,
    860                                                  const half2& b) {
    861   float a1 = __low2float(a);
    862   float a2 = __high2float(a);
    863   float b1 = __low2float(b);
    864   float b2 = __high2float(b);
    865   __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
    866   __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
    867   return combine_half(r1, r2);
    868 }
    869 
    870 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
    871 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
    872   return __hadd(__low2half(a), __high2half(a));
    873 #else
    874   float a1 = __low2float(a);
    875   float a2 = __high2float(a);
    876   return Eigen::half(__float2half(a1 + a2));
    877 #endif
    878 }
    879 
    880 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
    881 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
    882   __half first = __low2half(a);
    883   __half second = __high2half(a);
    884   return __hgt(first, second) ? first : second;
    885 #else
    886   float a1 = __low2float(a);
    887   float a2 = __high2float(a);
    888   return a1 > a2 ? get_half2_low(a) : get_half2_high(a);
    889 #endif
    890 }
    891 
    892 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
    893 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
    894   __half first = __low2half(a);
    895   __half second = __high2half(a);
    896   return __hlt(first, second) ? first : second;
    897 #else
    898   float a1 = __low2float(a);
    899   float a2 = __high2float(a);
    900   return a1 < a2 ? get_half2_low(a) : get_half2_high(a);
    901 #endif
    902 }
    903 
    904 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) {
    905 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
    906   return __hmul(__low2half(a), __high2half(a));
    907 #else
    908   float a1 = __low2float(a);
    909   float a2 = __high2float(a);
    910   return Eigen::half(__float2half(a1 * a2));
    911 #endif
    912 }
    913 
    914 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) {
    915   float a1 = __low2float(a);
    916   float a2 = __high2float(a);
    917   float r1 = log1pf(a1);
    918   float r2 = log1pf(a2);
    919   return __floats2half2_rn(r1, r2);
    920 }
    921 
    922 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) {
    923   float a1 = __low2float(a);
    924   float a2 = __high2float(a);
    925   float r1 = expm1f(a1);
    926   float r2 = expm1f(a2);
    927   return __floats2half2_rn(r1, r2);
    928 }
    929 
    930 #if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || \
    931   defined(EIGEN_HIP_DEVICE_COMPILE)
    932 
    933 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    934 half2 plog(const half2& a) {
    935   return h2log(a);
    936 }
    937 
    938  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    939 half2 pexp(const half2& a) {
    940   return h2exp(a);
    941 }
    942 
    943  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    944 half2 psqrt(const half2& a) {
    945   return h2sqrt(a);
    946 }
    947 
    948  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    949 half2 prsqrt(const half2& a) {
    950   return h2rsqrt(a);
    951 }
    952 
    953 #else
    954 
    955 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) {
    956   float a1 = __low2float(a);
    957   float a2 = __high2float(a);
    958   float r1 = logf(a1);
    959   float r2 = logf(a2);
    960   return __floats2half2_rn(r1, r2);
    961 }
    962 
    963 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) {
    964   float a1 = __low2float(a);
    965   float a2 = __high2float(a);
    966   float r1 = expf(a1);
    967   float r2 = expf(a2);
    968   return __floats2half2_rn(r1, r2);
    969 }
    970 
    971 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) {
    972   float a1 = __low2float(a);
    973   float a2 = __high2float(a);
    974   float r1 = sqrtf(a1);
    975   float r2 = sqrtf(a2);
    976   return __floats2half2_rn(r1, r2);
    977 }
    978 
    979 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) {
    980   float a1 = __low2float(a);
    981   float a2 = __high2float(a);
    982   float r1 = rsqrtf(a1);
    983   float r2 = rsqrtf(a2);
    984   return __floats2half2_rn(r1, r2);
    985 }
    986 #endif
    987 } // namespace
    988 
    989 template <>
    990 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
    991 pload<Packet4h2>(const Eigen::half* from) {
    992   return *reinterpret_cast<const Packet4h2*>(from);
    993 }
    994 
    995 // unaligned load;
    996 template <>
    997 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
    998 ploadu<Packet4h2>(const Eigen::half* from) {
    999   Packet4h2 r;
   1000   half2* p_alias = reinterpret_cast<half2*>(&r);
   1001   p_alias[0] = ploadu(from + 0);
   1002   p_alias[1] = ploadu(from + 2);
   1003   p_alias[2] = ploadu(from + 4);
   1004   p_alias[3] = ploadu(from + 6);
   1005   return r;
   1006 }
   1007 
   1008 template <>
   1009 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
   1010 ploaddup<Packet4h2>(const Eigen::half* from) {
   1011   Packet4h2 r;
   1012   half2* p_alias = reinterpret_cast<half2*>(&r);
   1013   p_alias[0] = ploaddup(from + 0);
   1014   p_alias[1] = ploaddup(from + 1);
   1015   p_alias[2] = ploaddup(from + 2);
   1016   p_alias[3] = ploaddup(from + 3);
   1017   return r;
   1018 }
   1019 
   1020 template <>
   1021 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(
   1022     Eigen::half* to, const Packet4h2& from) {
   1023   *reinterpret_cast<Packet4h2*>(to) = from;
   1024 }
   1025 
   1026 template <>
   1027 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(
   1028     Eigen::half* to, const Packet4h2& from) {
   1029   const half2* from_alias = reinterpret_cast<const half2*>(&from);
   1030   pstoreu(to + 0,from_alias[0]);
   1031   pstoreu(to + 2,from_alias[1]);
   1032   pstoreu(to + 4,from_alias[2]);
   1033   pstoreu(to + 6,from_alias[3]);
   1034 }
   1035 
   1036 template <>
   1037 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
   1038 ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
   1039 #if defined(EIGEN_GPU_HAS_LDG)
   1040   Packet4h2 r;
   1041   r = __ldg(reinterpret_cast<const Packet4h2*>(from));
   1042   return r;
   1043 #else
   1044   Packet4h2 r;
   1045   half2* r_alias = reinterpret_cast<half2*>(&r);
   1046   r_alias[0] = ploadt_ro_aligned(from + 0);
   1047   r_alias[1] = ploadt_ro_aligned(from + 2);
   1048   r_alias[2] = ploadt_ro_aligned(from + 4);
   1049   r_alias[3] = ploadt_ro_aligned(from + 6);
   1050   return r;
   1051 #endif
   1052 }
   1053 
   1054 template <>
   1055 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
   1056 ploadt_ro<Packet4h2, Unaligned>(const Eigen::half* from) {
   1057   Packet4h2 r;
   1058   half2* r_alias = reinterpret_cast<half2*>(&r);
   1059   r_alias[0] = ploadt_ro_unaligned(from + 0);
   1060   r_alias[1] = ploadt_ro_unaligned(from + 2);
   1061   r_alias[2] = ploadt_ro_unaligned(from + 4);
   1062   r_alias[3] = ploadt_ro_unaligned(from + 6);
   1063   return r;
   1064 }
   1065 
   1066 template <>
   1067 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
   1068 pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
   1069   Packet4h2 r;
   1070   half2* p_alias = reinterpret_cast<half2*>(&r);
   1071   p_alias[0] = combine_half(from[0 * stride], from[1 * stride]);
   1072   p_alias[1] = combine_half(from[2 * stride], from[3 * stride]);
   1073   p_alias[2] = combine_half(from[4 * stride], from[5 * stride]);
   1074   p_alias[3] = combine_half(from[6 * stride], from[7 * stride]);
   1075   return r;
   1076 }
   1077 
   1078 template <>
   1079 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h2>(
   1080     Eigen::half* to, const Packet4h2& from, Index stride) {
   1081   const half2* from_alias = reinterpret_cast<const half2*>(&from);
   1082   pscatter(to + stride * 0, from_alias[0], stride);
   1083   pscatter(to + stride * 2, from_alias[1], stride);
   1084   pscatter(to + stride * 4, from_alias[2], stride);
   1085   pscatter(to + stride * 6, from_alias[3], stride);
   1086 }
   1087 
   1088 template <>
   1089 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h2>(
   1090     const Packet4h2& a) {
   1091   return pfirst(*(reinterpret_cast<const half2*>(&a)));
   1092 }
   1093 
   1094 template <>
   1095 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>(
   1096     const Packet4h2& a) {
   1097   Packet4h2 r;
   1098   half2* p_alias = reinterpret_cast<half2*>(&r);
   1099   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1100   p_alias[0] = pabs(a_alias[0]);
   1101   p_alias[1] = pabs(a_alias[1]);
   1102   p_alias[2] = pabs(a_alias[2]);
   1103   p_alias[3] = pabs(a_alias[3]);
   1104   return r;
   1105 }
   1106 
   1107 template <>
   1108 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue<Packet4h2>(
   1109     const Packet4h2& /*a*/) {
   1110   half true_half = half_impl::raw_uint16_to_half(0xffffu);
   1111   return pset1<Packet4h2>(true_half);
   1112 }
   1113 
   1114 template <>
   1115 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero<Packet4h2>(const Packet4h2& /*a*/) {
   1116   half false_half = half_impl::raw_uint16_to_half(0x0000u);
   1117   return pset1<Packet4h2>(false_half);
   1118 }
   1119 
   1120 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double(
   1121     double* d_row0, double* d_row1, double* d_row2, double* d_row3,
   1122     double* d_row4, double* d_row5, double* d_row6, double* d_row7) {
   1123   double d_tmp;
   1124   d_tmp = d_row0[1];
   1125   d_row0[1] = d_row4[0];
   1126   d_row4[0] = d_tmp;
   1127 
   1128   d_tmp = d_row1[1];
   1129   d_row1[1] = d_row5[0];
   1130   d_row5[0] = d_tmp;
   1131 
   1132   d_tmp = d_row2[1];
   1133   d_row2[1] = d_row6[0];
   1134   d_row6[0] = d_tmp;
   1135 
   1136   d_tmp = d_row3[1];
   1137   d_row3[1] = d_row7[0];
   1138   d_row7[0] = d_tmp;
   1139 }
   1140 
   1141 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(
   1142     half2* f_row0, half2* f_row1, half2* f_row2, half2* f_row3) {
   1143   half2 f_tmp;
   1144   f_tmp = f_row0[1];
   1145   f_row0[1] = f_row2[0];
   1146   f_row2[0] = f_tmp;
   1147 
   1148   f_tmp = f_row1[1];
   1149   f_row1[1] = f_row3[0];
   1150   f_row3[0] = f_tmp;
   1151 }
   1152 
   1153 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
   1154 ptranspose_half(half2& f0, half2& f1) {
   1155   __half a1 = get_half2_low(f0);
   1156   __half a2 = get_half2_high(f0);
   1157   __half b1 = get_half2_low(f1);
   1158   __half b2 = get_half2_high(f1);
   1159   f0 = combine_half(a1, b1);
   1160   f1 = combine_half(a2, b2);
   1161 }
   1162 
   1163 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
   1164 ptranspose(PacketBlock<Packet4h2,8>& kernel) {
   1165   double* d_row0 = reinterpret_cast<double*>(&kernel.packet[0]);
   1166   double* d_row1 = reinterpret_cast<double*>(&kernel.packet[1]);
   1167   double* d_row2 = reinterpret_cast<double*>(&kernel.packet[2]);
   1168   double* d_row3 = reinterpret_cast<double*>(&kernel.packet[3]);
   1169   double* d_row4 = reinterpret_cast<double*>(&kernel.packet[4]);
   1170   double* d_row5 = reinterpret_cast<double*>(&kernel.packet[5]);
   1171   double* d_row6 = reinterpret_cast<double*>(&kernel.packet[6]);
   1172   double* d_row7 = reinterpret_cast<double*>(&kernel.packet[7]);
   1173   ptranspose_double(d_row0, d_row1, d_row2, d_row3,
   1174                     d_row4, d_row5, d_row6, d_row7);
   1175 
   1176 
   1177   half2* f_row0 = reinterpret_cast<half2*>(d_row0);
   1178   half2* f_row1 = reinterpret_cast<half2*>(d_row1);
   1179   half2* f_row2 = reinterpret_cast<half2*>(d_row2);
   1180   half2* f_row3 = reinterpret_cast<half2*>(d_row3);
   1181   ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
   1182   ptranspose_half(f_row0[0], f_row1[0]);
   1183   ptranspose_half(f_row0[1], f_row1[1]);
   1184   ptranspose_half(f_row2[0], f_row3[0]);
   1185   ptranspose_half(f_row2[1], f_row3[1]);
   1186 
   1187   f_row0 = reinterpret_cast<half2*>(d_row0 + 1);
   1188   f_row1 = reinterpret_cast<half2*>(d_row1 + 1);
   1189   f_row2 = reinterpret_cast<half2*>(d_row2 + 1);
   1190   f_row3 = reinterpret_cast<half2*>(d_row3 + 1);
   1191   ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
   1192   ptranspose_half(f_row0[0], f_row1[0]);
   1193   ptranspose_half(f_row0[1], f_row1[1]);
   1194   ptranspose_half(f_row2[0], f_row3[0]);
   1195   ptranspose_half(f_row2[1], f_row3[1]);
   1196 
   1197   f_row0 = reinterpret_cast<half2*>(d_row4);
   1198   f_row1 = reinterpret_cast<half2*>(d_row5);
   1199   f_row2 = reinterpret_cast<half2*>(d_row6);
   1200   f_row3 = reinterpret_cast<half2*>(d_row7);
   1201   ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
   1202   ptranspose_half(f_row0[0], f_row1[0]);
   1203   ptranspose_half(f_row0[1], f_row1[1]);
   1204   ptranspose_half(f_row2[0], f_row3[0]);
   1205   ptranspose_half(f_row2[1], f_row3[1]);
   1206 
   1207   f_row0 = reinterpret_cast<half2*>(d_row4 + 1);
   1208   f_row1 = reinterpret_cast<half2*>(d_row5 + 1);
   1209   f_row2 = reinterpret_cast<half2*>(d_row6 + 1);
   1210   f_row3 = reinterpret_cast<half2*>(d_row7 + 1);
   1211   ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
   1212   ptranspose_half(f_row0[0], f_row1[0]);
   1213   ptranspose_half(f_row0[1], f_row1[1]);
   1214   ptranspose_half(f_row2[0], f_row3[0]);
   1215   ptranspose_half(f_row2[1], f_row3[1]);
   1216 
   1217 }
   1218 
   1219 template <>
   1220 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
   1221 plset<Packet4h2>(const Eigen::half& a) {
   1222 #if defined(EIGEN_HIP_DEVICE_COMPILE)
   1223 
   1224   Packet4h2 r;
   1225   half2* p_alias = reinterpret_cast<half2*>(&r);
   1226   p_alias[0] = __halves2half2(a, __hadd(a, __float2half(1.0f)));
   1227   p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)),
   1228                               __hadd(a, __float2half(3.0f)));
   1229   p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)),
   1230                               __hadd(a, __float2half(5.0f)));
   1231   p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)),
   1232                               __hadd(a, __float2half(7.0f)));
   1233   return r;
   1234 #elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
   1235   Packet4h2 r;
   1236   half2* r_alias = reinterpret_cast<half2*>(&r);
   1237 
   1238   half2 b = pset1<half2>(a);
   1239   half2 c;
   1240   half2 half_offset0 = __halves2half2(__float2half(0.0f),__float2half(2.0f));
   1241   half2 half_offset1 = __halves2half2(__float2half(4.0f),__float2half(6.0f));
   1242 
   1243   c = __hadd2(b, half_offset0);
   1244   r_alias[0] = plset(__low2half(c));
   1245   r_alias[1] = plset(__high2half(c));
   1246 
   1247   c = __hadd2(b, half_offset1);
   1248   r_alias[2] = plset(__low2half(c));
   1249   r_alias[3] = plset(__high2half(c));
   1250 
   1251   return r;
   1252 
   1253 #else
   1254   float f = __half2float(a);
   1255   Packet4h2 r;
   1256   half2* p_alias = reinterpret_cast<half2*>(&r);
   1257   p_alias[0] = combine_half(a, __float2half(f + 1.0f));
   1258   p_alias[1] = combine_half(__float2half(f + 2.0f), __float2half(f + 3.0f));
   1259   p_alias[2] = combine_half(__float2half(f + 4.0f), __float2half(f + 5.0f));
   1260   p_alias[3] = combine_half(__float2half(f + 6.0f), __float2half(f + 7.0f));
   1261   return r;
   1262 #endif
   1263 }
   1264 
   1265 template <>
   1266 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
   1267 pselect<Packet4h2>(const Packet4h2& mask, const Packet4h2& a,
   1268                    const Packet4h2& b) {
   1269   Packet4h2 r;
   1270   half2* r_alias = reinterpret_cast<half2*>(&r);
   1271   const half2* mask_alias = reinterpret_cast<const half2*>(&mask);
   1272   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1273   const half2* b_alias = reinterpret_cast<const half2*>(&b);
   1274   r_alias[0] = pselect(mask_alias[0], a_alias[0], b_alias[0]);
   1275   r_alias[1] = pselect(mask_alias[1], a_alias[1], b_alias[1]);
   1276   r_alias[2] = pselect(mask_alias[2], a_alias[2], b_alias[2]);
   1277   r_alias[3] = pselect(mask_alias[3], a_alias[3], b_alias[3]);
   1278   return r;
   1279 }
   1280 
   1281 template <>
   1282 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
   1283 pcmp_eq<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
   1284   Packet4h2 r;
   1285   half2* r_alias = reinterpret_cast<half2*>(&r);
   1286   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1287   const half2* b_alias = reinterpret_cast<const half2*>(&b);
   1288   r_alias[0] = pcmp_eq(a_alias[0], b_alias[0]);
   1289   r_alias[1] = pcmp_eq(a_alias[1], b_alias[1]);
   1290   r_alias[2] = pcmp_eq(a_alias[2], b_alias[2]);
   1291   r_alias[3] = pcmp_eq(a_alias[3], b_alias[3]);
   1292   return r;
   1293 }
   1294 
   1295 template <>
   1296 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand<Packet4h2>(
   1297     const Packet4h2& a, const Packet4h2& b) {
   1298   Packet4h2 r;
   1299   half2* r_alias = reinterpret_cast<half2*>(&r);
   1300   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1301   const half2* b_alias = reinterpret_cast<const half2*>(&b);
   1302   r_alias[0] = pand(a_alias[0], b_alias[0]);
   1303   r_alias[1] = pand(a_alias[1], b_alias[1]);
   1304   r_alias[2] = pand(a_alias[2], b_alias[2]);
   1305   r_alias[3] = pand(a_alias[3], b_alias[3]);
   1306   return r;
   1307 }
   1308 
   1309 template <>
   1310 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por<Packet4h2>(
   1311     const Packet4h2& a, const Packet4h2& b) {
   1312   Packet4h2 r;
   1313   half2* r_alias = reinterpret_cast<half2*>(&r);
   1314   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1315   const half2* b_alias = reinterpret_cast<const half2*>(&b);
   1316   r_alias[0] = por(a_alias[0], b_alias[0]);
   1317   r_alias[1] = por(a_alias[1], b_alias[1]);
   1318   r_alias[2] = por(a_alias[2], b_alias[2]);
   1319   r_alias[3] = por(a_alias[3], b_alias[3]);
   1320   return r;
   1321 }
   1322 
   1323 template <>
   1324 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor<Packet4h2>(
   1325     const Packet4h2& a, const Packet4h2& b) {
   1326   Packet4h2 r;
   1327   half2* r_alias = reinterpret_cast<half2*>(&r);
   1328   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1329   const half2* b_alias = reinterpret_cast<const half2*>(&b);
   1330   r_alias[0] = pxor(a_alias[0], b_alias[0]);
   1331   r_alias[1] = pxor(a_alias[1], b_alias[1]);
   1332   r_alias[2] = pxor(a_alias[2], b_alias[2]);
   1333   r_alias[3] = pxor(a_alias[3], b_alias[3]);
   1334   return r;
   1335 }
   1336 
   1337 template <>
   1338 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
   1339 pandnot<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
   1340   Packet4h2 r;
   1341   half2* r_alias = reinterpret_cast<half2*>(&r);
   1342   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1343   const half2* b_alias = reinterpret_cast<const half2*>(&b);
   1344   r_alias[0] = pandnot(a_alias[0], b_alias[0]);
   1345   r_alias[1] = pandnot(a_alias[1], b_alias[1]);
   1346   r_alias[2] = pandnot(a_alias[2], b_alias[2]);
   1347   r_alias[3] = pandnot(a_alias[3], b_alias[3]);
   1348   return r;
   1349 }
   1350 
   1351 template <>
   1352 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd<Packet4h2>(
   1353     const Packet4h2& a, const Packet4h2& b) {
   1354   Packet4h2 r;
   1355   half2* r_alias = reinterpret_cast<half2*>(&r);
   1356   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1357   const half2* b_alias = reinterpret_cast<const half2*>(&b);
   1358   r_alias[0] = padd(a_alias[0], b_alias[0]);
   1359   r_alias[1] = padd(a_alias[1], b_alias[1]);
   1360   r_alias[2] = padd(a_alias[2], b_alias[2]);
   1361   r_alias[3] = padd(a_alias[3], b_alias[3]);
   1362   return r;
   1363 }
   1364 
   1365 template <>
   1366 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub<Packet4h2>(
   1367     const Packet4h2& a, const Packet4h2& b) {
   1368   Packet4h2 r;
   1369   half2* r_alias = reinterpret_cast<half2*>(&r);
   1370   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1371   const half2* b_alias = reinterpret_cast<const half2*>(&b);
   1372   r_alias[0] = psub(a_alias[0], b_alias[0]);
   1373   r_alias[1] = psub(a_alias[1], b_alias[1]);
   1374   r_alias[2] = psub(a_alias[2], b_alias[2]);
   1375   r_alias[3] = psub(a_alias[3], b_alias[3]);
   1376   return r;
   1377 }
   1378 
   1379 template <>
   1380 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pnegate(const Packet4h2& a) {
   1381   Packet4h2 r;
   1382   half2* r_alias = reinterpret_cast<half2*>(&r);
   1383   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1384   r_alias[0] = pnegate(a_alias[0]);
   1385   r_alias[1] = pnegate(a_alias[1]);
   1386   r_alias[2] = pnegate(a_alias[2]);
   1387   r_alias[3] = pnegate(a_alias[3]);
   1388   return r;
   1389 }
   1390 
   1391 template <>
   1392 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pconj(const Packet4h2& a) {
   1393   return a;
   1394 }
   1395 
   1396 template <>
   1397 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul<Packet4h2>(
   1398     const Packet4h2& a, const Packet4h2& b) {
   1399   Packet4h2 r;
   1400   half2* r_alias = reinterpret_cast<half2*>(&r);
   1401   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1402   const half2* b_alias = reinterpret_cast<const half2*>(&b);
   1403   r_alias[0] = pmul(a_alias[0], b_alias[0]);
   1404   r_alias[1] = pmul(a_alias[1], b_alias[1]);
   1405   r_alias[2] = pmul(a_alias[2], b_alias[2]);
   1406   r_alias[3] = pmul(a_alias[3], b_alias[3]);
   1407   return r;
   1408 }
   1409 
   1410 template <>
   1411 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd<Packet4h2>(
   1412     const Packet4h2& a, const Packet4h2& b, const Packet4h2& c) {
   1413   Packet4h2 r;
   1414   half2* r_alias = reinterpret_cast<half2*>(&r);
   1415   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1416   const half2* b_alias = reinterpret_cast<const half2*>(&b);
   1417   const half2* c_alias = reinterpret_cast<const half2*>(&c);
   1418   r_alias[0] = pmadd(a_alias[0], b_alias[0], c_alias[0]);
   1419   r_alias[1] = pmadd(a_alias[1], b_alias[1], c_alias[1]);
   1420   r_alias[2] = pmadd(a_alias[2], b_alias[2], c_alias[2]);
   1421   r_alias[3] = pmadd(a_alias[3], b_alias[3], c_alias[3]);
   1422   return r;
   1423 }
   1424 
   1425 template <>
   1426 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv<Packet4h2>(
   1427     const Packet4h2& a, const Packet4h2& b) {
   1428   Packet4h2 r;
   1429   half2* r_alias = reinterpret_cast<half2*>(&r);
   1430   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1431   const half2* b_alias = reinterpret_cast<const half2*>(&b);
   1432   r_alias[0] = pdiv(a_alias[0], b_alias[0]);
   1433   r_alias[1] = pdiv(a_alias[1], b_alias[1]);
   1434   r_alias[2] = pdiv(a_alias[2], b_alias[2]);
   1435   r_alias[3] = pdiv(a_alias[3], b_alias[3]);
   1436   return r;
   1437 }
   1438 
   1439 template <>
   1440 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin<Packet4h2>(
   1441     const Packet4h2& a, const Packet4h2& b) {
   1442   Packet4h2 r;
   1443   half2* r_alias = reinterpret_cast<half2*>(&r);
   1444   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1445   const half2* b_alias = reinterpret_cast<const half2*>(&b);
   1446   r_alias[0] = pmin(a_alias[0], b_alias[0]);
   1447   r_alias[1] = pmin(a_alias[1], b_alias[1]);
   1448   r_alias[2] = pmin(a_alias[2], b_alias[2]);
   1449   r_alias[3] = pmin(a_alias[3], b_alias[3]);
   1450   return r;
   1451 }
   1452 
   1453 template <>
   1454 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax<Packet4h2>(
   1455     const Packet4h2& a, const Packet4h2& b) {
   1456   Packet4h2 r;
   1457   half2* r_alias = reinterpret_cast<half2*>(&r);
   1458   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1459   const half2* b_alias = reinterpret_cast<const half2*>(&b);
   1460   r_alias[0] = pmax(a_alias[0], b_alias[0]);
   1461   r_alias[1] = pmax(a_alias[1], b_alias[1]);
   1462   r_alias[2] = pmax(a_alias[2], b_alias[2]);
   1463   r_alias[3] = pmax(a_alias[3], b_alias[3]);
   1464   return r;
   1465 }
   1466 
   1467 template <>
   1468 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<Packet4h2>(
   1469     const Packet4h2& a) {
   1470   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1471 
   1472   return predux(a_alias[0]) + predux(a_alias[1]) +
   1473          predux(a_alias[2]) + predux(a_alias[3]);
   1474 }
   1475 
   1476 template <>
   1477 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(
   1478     const Packet4h2& a) {
   1479   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1480   half2 m0 = combine_half(predux_max(a_alias[0]),
   1481                             predux_max(a_alias[1]));
   1482   half2 m1 = combine_half(predux_max(a_alias[2]),
   1483                             predux_max(a_alias[3]));
   1484   __half first  = predux_max(m0);
   1485   __half second = predux_max(m1);
   1486 #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
   1487   return (__hgt(first, second) ? first : second);
   1488 #else
   1489   float ffirst  = __half2float(first);
   1490   float fsecond = __half2float(second);
   1491   return (ffirst > fsecond)? first: second;
   1492 #endif
   1493 }
   1494 
   1495 template <>
   1496 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(
   1497     const Packet4h2& a) {
   1498   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1499   half2 m0 = combine_half(predux_min(a_alias[0]),
   1500                             predux_min(a_alias[1]));
   1501   half2 m1 = combine_half(predux_min(a_alias[2]),
   1502                             predux_min(a_alias[3]));
   1503   __half first  = predux_min(m0);
   1504   __half second = predux_min(m1);
   1505 #if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
   1506   return (__hlt(first, second) ? first : second);
   1507 #else
   1508   float ffirst  = __half2float(first);
   1509   float fsecond = __half2float(second);
   1510   return (ffirst < fsecond)? first: second;
   1511 #endif
   1512 }
   1513 
   1514 // likely overflow/underflow
   1515 template <>
   1516 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4h2>(
   1517     const Packet4h2& a) {
   1518   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1519   return predux_mul(pmul(pmul(a_alias[0], a_alias[1]),
   1520                                        pmul(a_alias[2], a_alias[3])));
   1521 }
   1522 
   1523 template <>
   1524 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
   1525 plog1p<Packet4h2>(const Packet4h2& a) {
   1526   Packet4h2 r;
   1527   half2* r_alias = reinterpret_cast<half2*>(&r);
   1528   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1529   r_alias[0] = plog1p(a_alias[0]);
   1530   r_alias[1] = plog1p(a_alias[1]);
   1531   r_alias[2] = plog1p(a_alias[2]);
   1532   r_alias[3] = plog1p(a_alias[3]);
   1533   return r;
   1534 }
   1535 
   1536 template <>
   1537 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
   1538 pexpm1<Packet4h2>(const Packet4h2& a) {
   1539   Packet4h2 r;
   1540   half2* r_alias = reinterpret_cast<half2*>(&r);
   1541   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1542   r_alias[0] = pexpm1(a_alias[0]);
   1543   r_alias[1] = pexpm1(a_alias[1]);
   1544   r_alias[2] = pexpm1(a_alias[2]);
   1545   r_alias[3] = pexpm1(a_alias[3]);
   1546   return r;
   1547 }
   1548 
   1549 template <>
   1550 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog<Packet4h2>(const Packet4h2& a) {
   1551   Packet4h2 r;
   1552   half2* r_alias = reinterpret_cast<half2*>(&r);
   1553   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1554   r_alias[0] = plog(a_alias[0]);
   1555   r_alias[1] = plog(a_alias[1]);
   1556   r_alias[2] = plog(a_alias[2]);
   1557   r_alias[3] = plog(a_alias[3]);
   1558   return r;
   1559 }
   1560 
   1561 template <>
   1562 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexp<Packet4h2>(const Packet4h2& a) {
   1563   Packet4h2 r;
   1564   half2* r_alias = reinterpret_cast<half2*>(&r);
   1565   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1566   r_alias[0] = pexp(a_alias[0]);
   1567   r_alias[1] = pexp(a_alias[1]);
   1568   r_alias[2] = pexp(a_alias[2]);
   1569   r_alias[3] = pexp(a_alias[3]);
   1570   return r;
   1571 }
   1572 
   1573 template <>
   1574 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psqrt<Packet4h2>(const Packet4h2& a) {
   1575   Packet4h2 r;
   1576   half2* r_alias = reinterpret_cast<half2*>(&r);
   1577   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1578   r_alias[0] = psqrt(a_alias[0]);
   1579   r_alias[1] = psqrt(a_alias[1]);
   1580   r_alias[2] = psqrt(a_alias[2]);
   1581   r_alias[3] = psqrt(a_alias[3]);
   1582   return r;
   1583 }
   1584 
   1585 template <>
   1586 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
   1587 prsqrt<Packet4h2>(const Packet4h2& a) {
   1588   Packet4h2 r;
   1589   half2* r_alias = reinterpret_cast<half2*>(&r);
   1590   const half2* a_alias = reinterpret_cast<const half2*>(&a);
   1591   r_alias[0] = prsqrt(a_alias[0]);
   1592   r_alias[1] = prsqrt(a_alias[1]);
   1593   r_alias[2] = prsqrt(a_alias[2]);
   1594   r_alias[3] = prsqrt(a_alias[3]);
   1595   return r;
   1596 }
   1597 
   1598 // The following specialized padd, pmul, pdiv, pmin, pmax, pset1 are needed for
   1599 // the implementation of GPU half reduction.
   1600 template<>
   1601 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a,
   1602                                                         const half2& b) {
   1603 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   1604   return __hadd2(a, b);
   1605 #else
   1606   float a1 = __low2float(a);
   1607   float a2 = __high2float(a);
   1608   float b1 = __low2float(b);
   1609   float b2 = __high2float(b);
   1610   float r1 = a1 + b1;
   1611   float r2 = a2 + b2;
   1612   return __floats2half2_rn(r1, r2);
   1613 #endif
   1614 }
   1615 
   1616 template<>
   1617 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a,
   1618                                                         const half2& b) {
   1619 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   1620   return __hmul2(a, b);
   1621 #else
   1622   float a1 = __low2float(a);
   1623   float a2 = __high2float(a);
   1624   float b1 = __low2float(b);
   1625   float b2 = __high2float(b);
   1626   float r1 = a1 * b1;
   1627   float r2 = a2 * b2;
   1628   return __floats2half2_rn(r1, r2);
   1629 #endif
   1630 }
   1631 
   1632 template<>
   1633 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a,
   1634                                                         const half2& b) {
   1635 #if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   1636   return __h2div(a, b);
   1637 #else
   1638   float a1 = __low2float(a);
   1639   float a2 = __high2float(a);
   1640   float b1 = __low2float(b);
   1641   float b2 = __high2float(b);
   1642   float r1 = a1 / b1;
   1643   float r2 = a2 / b2;
   1644   return __floats2half2_rn(r1, r2);
   1645 #endif
   1646 }
   1647 
   1648 template<>
   1649 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a,
   1650                                                         const half2& b) {
   1651   float a1 = __low2float(a);
   1652   float a2 = __high2float(a);
   1653   float b1 = __low2float(b);
   1654   float b2 = __high2float(b);
   1655   __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
   1656   __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
   1657   return combine_half(r1, r2);
   1658 }
   1659 
   1660 template<>
   1661 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a,
   1662                                                         const half2& b) {
   1663   float a1 = __low2float(a);
   1664   float a2 = __high2float(a);
   1665   float b1 = __low2float(b);
   1666   float b2 = __high2float(b);
   1667   __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
   1668   __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
   1669   return combine_half(r1, r2);
   1670 }
   1671 
   1672 // #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
   1673 
   1674 #endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
   1675 
   1676 #undef EIGEN_GPU_HAS_LDG
   1677 #undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
   1678 #undef EIGEN_GPU_HAS_FP16_ARITHMETIC
   1679 
   1680 } // end namespace internal
   1681 
   1682 } // end namespace Eigen
   1683 
   1684 
   1685 #endif // EIGEN_PACKET_MATH_GPU_H
	cart-elc Source code for CART-ELC
	git clone git://git.laack.co/cart-elc.git
	Log \| Files \| Refs \| README \| LICENSE