cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

PacketMath.h (189525B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
      5 // Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
      6 // Heavily based on Gael's SSE version.
      7 //
      8 // This Source Code Form is subject to the terms of the Mozilla
      9 // Public License v. 2.0. If a copy of the MPL was not distributed
     10 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
     11 
     12 #ifndef EIGEN_PACKET_MATH_NEON_H
     13 #define EIGEN_PACKET_MATH_NEON_H
     14 
     15 namespace Eigen {
     16 
     17 namespace internal {
     18 
// Default for EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD: 8, unless a value was
// already defined before this point.
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

// Defined (with an empty value) unless it is already defined.
// NOTE(review): presumably advertises that multiply-add is a single
// instruction on this target -- confirm against the code that tests it.
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif

// Default register count: 32 when EIGEN_ARCH_ARM64 is true, 16 otherwise.
// A definition made earlier takes precedence over both values.
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#if EIGEN_ARCH_ARM64
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#else
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
#endif
#endif
     34 
// Packet type aliases. The naming is Packet<lane count><element suffix>:
// f = float, c/uc = (u)int8, s/us = (u)int16, i/ui = (u)int32, l/ul = (u)int64.
#if EIGEN_COMP_MSVC_STRICT

// In MSVC's arm_neon.h header file, all NEON vector types
// are aliases to the same underlying type __n128.
// We thus have to wrap them to make them different C++ types.
// (See also bug 1428)
// The second template argument is a unique tag (0..17) per packet type.
typedef eigen_packet_wrapper<float32x2_t,0>  Packet2f;
typedef eigen_packet_wrapper<float32x4_t,1>  Packet4f;
typedef eigen_packet_wrapper<int32_t    ,2>  Packet4c;
typedef eigen_packet_wrapper<int8x8_t   ,3>  Packet8c;
typedef eigen_packet_wrapper<int8x16_t  ,4>  Packet16c;
typedef eigen_packet_wrapper<uint32_t   ,5>  Packet4uc;
typedef eigen_packet_wrapper<uint8x8_t  ,6>  Packet8uc;
typedef eigen_packet_wrapper<uint8x16_t ,7>  Packet16uc;
typedef eigen_packet_wrapper<int16x4_t  ,8>  Packet4s;
typedef eigen_packet_wrapper<int16x8_t  ,9>  Packet8s;
typedef eigen_packet_wrapper<uint16x4_t ,10> Packet4us;
typedef eigen_packet_wrapper<uint16x8_t ,11> Packet8us;
typedef eigen_packet_wrapper<int32x2_t  ,12> Packet2i;
typedef eigen_packet_wrapper<int32x4_t  ,13> Packet4i;
typedef eigen_packet_wrapper<uint32x2_t ,14> Packet2ui;
typedef eigen_packet_wrapper<uint32x4_t ,15> Packet4ui;
typedef eigen_packet_wrapper<int64x2_t  ,16> Packet2l;
typedef eigen_packet_wrapper<uint64x2_t ,17> Packet2ul;

#else

// Other compilers: the native NEON vector types are used directly.
// Packet4c and Packet4uc are stored in a plain 32-bit integer, so they stay
// wrapped (with the same tags 2 and 5 as above) to remain distinct from
// int32_t / uint32_t.
typedef float32x2_t                          Packet2f;
typedef float32x4_t                          Packet4f;
typedef eigen_packet_wrapper<int32_t    ,2>  Packet4c;
typedef int8x8_t                             Packet8c;
typedef int8x16_t                            Packet16c;
typedef eigen_packet_wrapper<uint32_t   ,5>  Packet4uc;
typedef uint8x8_t                            Packet8uc;
typedef uint8x16_t                           Packet16uc;
typedef int16x4_t                            Packet4s;
typedef int16x8_t                            Packet8s;
typedef uint16x4_t                           Packet4us;
typedef uint16x8_t                           Packet8us;
typedef int32x2_t                            Packet2i;
typedef int32x4_t                            Packet4i;
typedef uint32x2_t                           Packet2ui;
typedef uint32x4_t                           Packet4ui;
typedef int64x2_t                            Packet2l;
typedef uint64x2_t                           Packet2ul;

#endif // EIGEN_COMP_MSVC_STRICT
     82 
     83 EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){
     84   const float* a = reinterpret_cast<const float*>(&m);
     85   Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))};
     86   return res;
     87 }
     88 
// Functionally equivalent to _mm_shuffle_ps in SSE when interleave
// == false (i.e. shuffle2<false>(m, n, mask) equals _mm_shuffle_ps(m, n, mask));
// interleaves m and n when interleave == true. Currently used in LU/arch/InverseSize4.h
// to enable a shared implementation for fast inversion of matrices of size 4.
     93 template<bool interleave> 
     94 EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int mask)
     95 {
     96   const float* a = reinterpret_cast<const float*>(&m);
     97   const float* b = reinterpret_cast<const float*>(&n);
     98   Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
     99   return res;
    100 }
    101 
    102 template<> 
    103 EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f &m, const Packet4f &n, int mask) 
    104 {
    105   const float* a = reinterpret_cast<const float*>(&m);
    106   const float* b = reinterpret_cast<const float*>(&n);
    107   Packet4f res = {*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
    108   return res;
    109 }
    110 
    111 EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {return ((s)<<6|(r)<<4|(q)<<2|(p));}
    112 
    113 EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s)
    114 { 
    115   return shuffle1(a, eigen_neon_shuffle_mask(p, q, r, s));
    116 }
    117 EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s)
    118 { 
    119   return shuffle2<false>(a,b,eigen_neon_shuffle_mask(p, q, r, s));
    120 }
    121 EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
    122 {
    123   return shuffle2<false>(a,b,eigen_neon_shuffle_mask(0, 1, 0, 1));
    124 }
    125 EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b)
    126 {
    127   return shuffle2<false>(b,a,eigen_neon_shuffle_mask(2, 3, 2, 3));
    128 }
    129 EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b)
    130 {
    131   return shuffle2<true>(a,b,eigen_neon_shuffle_mask(0, 0, 1, 1));
    132 }
    133 EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
    134 {
    135   return shuffle2<true>(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3));
    136 }
// Expands to a vdupq_lane_f32 call on the low half of a (vget_low_f32(a))
// with lane index p, producing a four-lane result.
// NOTE(review): only the low half of a is passed on, so p presumably has to
// be 0 or 1 (and a compile-time constant) -- confirm against the intrinsic
// reference. The macro arguments are not parenthesized.
#define vec4f_duplane(a, p) \
  vdupq_lane_f32(vget_low_f32(a), p)
    139 
// Declares `const Packet4f p4f_<NAME>` initialized with pset1<Packet4f>(X).
// No trailing semicolon is emitted; the user of the macro supplies it.
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  const Packet4f p4f_##NAME = pset1<Packet4f>(X)
    142 
    143 #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
    144   const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int32_t>(X))
    145 
// Declares `const Packet4i p4i_<NAME>` initialized with pset1<Packet4i>(X).
// No trailing semicolon is emitted; the user of the macro supplies it.
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  const Packet4i p4i_##NAME = pset1<Packet4i>(X)
    148 
// EIGEN_ARM_PREFETCH(ADDR): prefetch hint for address ADDR. The first matching
// branch wins:
//   1. ARM64: inline asm "prfm pldl1keep".
//   2. __builtin_prefetch, when the builtin is reported available or the
//      compiler is GNU-compatible.
//   3. __pld, when that name is defined as a macro.
//   4. ARM32: inline asm "pld".
//   5. Otherwise the macro expands to nothing.
// NOTE(review): branches 1, 2 and 4 already end in ';' while branches 3 and 5
// do not, so call sites must always write their own ';' after the macro
// (which yields a harmless empty statement for branches 1, 2 and 4).
#if EIGEN_ARCH_ARM64
  // __builtin_prefetch tends to do nothing on ARM64 compilers because the
  // prefetch instructions there are too detailed for __builtin_prefetch to map
  // meaningfully to them.
  #define EIGEN_ARM_PREFETCH(ADDR)  __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
  #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
#elif defined __pld
  #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
#elif EIGEN_ARCH_ARM32
  #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
#else
  // by default no explicit prefetching
  #define EIGEN_ARM_PREFETCH(ADDR)
#endif
    164 
    165 template <>
    166 struct packet_traits<float> : default_packet_traits
    167 {
    168   typedef Packet4f type;
    169   typedef Packet2f half;
    170   enum
    171   {
    172     Vectorizable = 1,
    173     AlignedOnScalar = 1,
    174     size = 4,
    175     HasHalfPacket = 1,
    176 
    177     HasAdd       = 1,
    178     HasSub       = 1,
    179     HasShift     = 1,
    180     HasMul       = 1,
    181     HasNegate    = 1,
    182     HasAbs       = 1,
    183     HasArg       = 0,
    184     HasAbs2      = 1,
    185     HasAbsDiff   = 1,
    186     HasMin       = 1,
    187     HasMax       = 1,
    188     HasConj      = 1,
    189     HasSetLinear = 0,
    190     HasBlend     = 0,
    191 
    192     HasDiv   = 1,
    193     HasFloor = 1,
    194     HasCeil = 1,
    195     HasRint = 1,
    196 
    197     HasSin  = EIGEN_FAST_MATH,
    198     HasCos  = EIGEN_FAST_MATH,
    199     HasLog  = 1,
    200     HasExp  = 1,
    201     HasSqrt = 1,
    202     HasRsqrt = 1,
    203     HasTanh = EIGEN_FAST_MATH,
    204     HasErf  = EIGEN_FAST_MATH,
    205     HasBessel = 0,  // Issues with accuracy.
    206     HasNdtri = 0
    207   };
    208 };
    209 
    210 template <>
    211 struct packet_traits<int8_t> : default_packet_traits
    212 {
    213   typedef Packet16c type;
    214   typedef Packet8c half;
    215   enum
    216   {
    217     Vectorizable = 1,
    218     AlignedOnScalar = 1,
    219     size = 16,
    220     HasHalfPacket = 1,
    221 
    222     HasAdd       = 1,
    223     HasSub       = 1,
    224     HasShift     = 1,
    225     HasMul       = 1,
    226     HasNegate    = 1,
    227     HasAbs       = 1,
    228     HasAbsDiff   = 1,
    229     HasArg       = 0,
    230     HasAbs2      = 1,
    231     HasMin       = 1,
    232     HasMax       = 1,
    233     HasConj      = 1,
    234     HasSetLinear = 0,
    235     HasBlend     = 0
    236   };
    237 };
    238 
    239 template <>
    240 struct packet_traits<uint8_t> : default_packet_traits
    241 {
    242   typedef Packet16uc type;
    243   typedef Packet8uc half;
    244   enum
    245   {
    246     Vectorizable = 1,
    247     AlignedOnScalar = 1,
    248     size = 16,
    249     HasHalfPacket = 1,
    250 
    251     HasAdd       = 1,
    252     HasSub       = 1,
    253     HasShift     = 1,
    254     HasMul       = 1,
    255     HasNegate    = 0,
    256     HasAbs       = 1,
    257     HasAbsDiff   = 1,
    258     HasArg       = 0,
    259     HasAbs2      = 1,
    260     HasMin       = 1,
    261     HasMax       = 1,
    262     HasConj      = 1,
    263     HasSetLinear = 0,
    264     HasBlend     = 0,
    265 
    266     HasSqrt = 1
    267   };
    268 };
    269 
    270 template <>
    271 struct packet_traits<int16_t> : default_packet_traits
    272 {
    273   typedef Packet8s type;
    274   typedef Packet4s half;
    275   enum
    276   {
    277     Vectorizable = 1,
    278     AlignedOnScalar = 1,
    279     size = 8,
    280     HasHalfPacket = 1,
    281 
    282     HasAdd       = 1,
    283     HasSub       = 1,
    284     HasShift     = 1,
    285     HasMul       = 1,
    286     HasNegate    = 1,
    287     HasAbs       = 1,
    288     HasAbsDiff   = 1,
    289     HasArg       = 0,
    290     HasAbs2      = 1,
    291     HasMin       = 1,
    292     HasMax       = 1,
    293     HasConj      = 1,
    294     HasSetLinear = 0,
    295     HasBlend     = 0
    296   };
    297 };
    298 
    299 template <>
    300 struct packet_traits<uint16_t> : default_packet_traits
    301 {
    302   typedef Packet8us type;
    303   typedef Packet4us half;
    304   enum
    305   {
    306     Vectorizable = 1,
    307     AlignedOnScalar = 1,
    308     size = 8,
    309     HasHalfPacket = 1,
    310 
    311     HasAdd       = 1,
    312     HasSub       = 1,
    313     HasShift     = 1,
    314     HasMul       = 1,
    315     HasNegate    = 0,
    316     HasAbs       = 0,
    317     HasAbsDiff   = 1,
    318     HasArg       = 0,
    319     HasAbs2      = 1,
    320     HasMin       = 1,
    321     HasMax       = 1,
    322     HasConj      = 1,
    323     HasSetLinear = 0,
    324     HasBlend     = 0,
    325     HasSqrt = 1
    326   };
    327 };
    328 
    329 template <>
    330 struct packet_traits<int32_t> : default_packet_traits
    331 {
    332   typedef Packet4i type;
    333   typedef Packet2i half;
    334   enum
    335   {
    336     Vectorizable = 1,
    337     AlignedOnScalar = 1,
    338     size = 4,
    339     HasHalfPacket = 1,
    340 
    341     HasAdd       = 1,
    342     HasSub       = 1,
    343     HasShift     = 1,
    344     HasMul       = 1,
    345     HasNegate    = 1,
    346     HasAbs       = 1,
    347     HasArg       = 0,
    348     HasAbs2      = 1,
    349     HasAbsDiff   = 1,
    350     HasMin       = 1,
    351     HasMax       = 1,
    352     HasConj      = 1,
    353     HasSetLinear = 0,
    354     HasBlend     = 0
    355   };
    356 };
    357 
    358 template <>
    359 struct packet_traits<uint32_t> : default_packet_traits
    360 {
    361   typedef Packet4ui type;
    362   typedef Packet2ui half;
    363   enum
    364   {
    365     Vectorizable = 1,
    366     AlignedOnScalar = 1,
    367     size = 4,
    368     HasHalfPacket = 1,
    369 
    370     HasAdd       = 1,
    371     HasSub       = 1,
    372     HasShift     = 1,
    373     HasMul       = 1,
    374     HasNegate    = 0,
    375     HasAbs       = 0,
    376     HasArg       = 0,
    377     HasAbs2      = 1,
    378     HasAbsDiff   = 1,
    379     HasMin       = 1,
    380     HasMax       = 1,
    381     HasConj      = 1,
    382     HasSetLinear = 0,
    383     HasBlend     = 0,
    384 
    385     HasSqrt = 1
    386   };
    387 };
    388 
    389 template <>
    390 struct packet_traits<int64_t> : default_packet_traits
    391 {
    392   typedef Packet2l type;
    393   typedef Packet2l half;
    394   enum
    395   {
    396     Vectorizable = 1,
    397     AlignedOnScalar = 1,
    398     size = 2,
    399     HasHalfPacket = 0,
    400 
    401     HasCmp       = 1,
    402     HasAdd       = 1,
    403     HasSub       = 1,
    404     HasShift     = 1,
    405     HasMul       = 1,
    406     HasNegate    = 1,
    407     HasAbs       = 1,
    408     HasArg       = 0,
    409     HasAbs2      = 1,
    410     HasAbsDiff   = 1,
    411     HasMin       = 1,
    412     HasMax       = 1,
    413     HasConj      = 1,
    414     HasSetLinear = 0,
    415     HasBlend     = 0
    416   };
    417 };
    418 
    419 template <>
    420 struct packet_traits<uint64_t> : default_packet_traits
    421 {
    422   typedef Packet2ul type;
    423   typedef Packet2ul half;
    424   enum
    425   {
    426     Vectorizable = 1,
    427     AlignedOnScalar = 1,
    428     size = 2,
    429     HasHalfPacket = 0,
    430 
    431     HasCmp       = 1,
    432     HasAdd       = 1,
    433     HasSub       = 1,
    434     HasShift     = 1,
    435     HasMul       = 1,
    436     HasNegate    = 0,
    437     HasAbs       = 0,
    438     HasArg       = 0,
    439     HasAbs2      = 1,
    440     HasAbsDiff   = 1,
    441     HasMin       = 1,
    442     HasMax       = 1,
    443     HasConj      = 1,
    444     HasSetLinear = 0,
    445     HasBlend     = 0
    446   };
    447 };
    448 
// Only compiled for GCC up to 4.4 (and not for LLVM-based compilers).
// Each overload below has the same name as a global NEON intrinsic but takes a
// plain float pointer; it forwards to the global intrinsic (::name) after
// casting the pointer to float32_t*.
// NOTE(review): presumably those GCC versions reject a float* where a
// float32_t* is expected -- confirm before removing.
#if EIGEN_GNUC_AT_MOST(4, 4) && !EIGEN_COMP_LLVM
// workaround gcc 4.2, 4.3 and 4.4 compilation issue
EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
EIGEN_STRONG_INLINE float32x2_t vld1_f32(const float* x) { return ::vld1_f32 ((const float32_t*)x); }
EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32(const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); }
EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
#endif
    457 
    458 template<> struct unpacket_traits<Packet2f>
    459 {
    460   typedef float type;
    461   typedef Packet2f half;
    462   typedef Packet2i integer_packet;
    463   enum
    464   {
    465     size = 2,
    466     alignment = Aligned16,
    467     vectorizable = true,
    468     masked_load_available = false,
    469     masked_store_available = false
    470   };
    471 };
    472 template<> struct unpacket_traits<Packet4f>
    473 {
    474   typedef float type;
    475   typedef Packet2f half;
    476   typedef Packet4i integer_packet;
    477   enum
    478   {
    479     size = 4,
    480     alignment = Aligned16,
    481     vectorizable = true,
    482     masked_load_available = false,
    483     masked_store_available = false
    484   };
    485 };
    486 template<> struct unpacket_traits<Packet4c>
    487 {
    488   typedef int8_t type;
    489   typedef Packet4c half;
    490   enum
    491   {
    492     size = 4,
    493     alignment = Unaligned,
    494     vectorizable = true,
    495     masked_load_available = false,
    496     masked_store_available = false
    497   };
    498 };
    499 template<> struct unpacket_traits<Packet8c>
    500 {
    501   typedef int8_t type;
    502   typedef Packet4c half;
    503   enum
    504   {
    505     size = 8,
    506     alignment = Aligned16,
    507     vectorizable = true,
    508     masked_load_available = false,
    509     masked_store_available = false
    510   };
    511 };
    512 template<> struct unpacket_traits<Packet16c>
    513 {
    514   typedef int8_t type;
    515   typedef Packet8c half;
    516   enum
    517   {
    518     size = 16,
    519     alignment = Aligned16,
    520     vectorizable = true,
    521     masked_load_available = false,
    522     masked_store_available = false
    523   };
    524 };
    525 template<> struct unpacket_traits<Packet4uc>
    526 {
    527   typedef uint8_t type;
    528   typedef Packet4uc half;
    529   enum
    530   {
    531     size = 4,
    532     alignment = Unaligned,
    533     vectorizable = true,
    534     masked_load_available = false,
    535     masked_store_available = false
    536   };
    537 };
    538 template<> struct unpacket_traits<Packet8uc>
    539 {
    540   typedef uint8_t type;
    541   typedef Packet4uc half;
    542   enum
    543   {
    544     size = 8,
    545     alignment = Aligned16,
    546     vectorizable = true,
    547     masked_load_available = false,
    548     masked_store_available = false
    549   };
    550 };
    551 template<> struct unpacket_traits<Packet16uc>
    552 {
    553   typedef uint8_t type;
    554   typedef Packet8uc half;
    555   enum
    556   {
    557     size = 16,
    558     alignment = Aligned16,
    559     vectorizable = true,
    560     masked_load_available = false,
    561     masked_store_available = false};
    562 };
    563 template<> struct unpacket_traits<Packet4s>
    564 {
    565   typedef int16_t type;
    566   typedef Packet4s half;
    567   enum
    568   {
    569     size = 4,
    570     alignment = Aligned16,
    571     vectorizable = true,
    572     masked_load_available = false,
    573     masked_store_available = false
    574   };
    575 };
    576 template<> struct unpacket_traits<Packet8s>
    577 {
    578   typedef int16_t type;
    579   typedef Packet4s half;
    580   enum
    581   {
    582     size = 8,
    583     alignment = Aligned16,
    584     vectorizable = true,
    585     masked_load_available = false,
    586     masked_store_available = false
    587   };
    588 };
    589 template<> struct unpacket_traits<Packet4us>
    590 {
    591   typedef uint16_t type;
    592   typedef Packet4us half;
    593   enum
    594   {
    595     size = 4,
    596     alignment = Aligned16,
    597     vectorizable = true,
    598     masked_load_available = false,
    599     masked_store_available = false
    600   };
    601 };
    602 template<> struct unpacket_traits<Packet8us>
    603 {
    604   typedef uint16_t type;
    605   typedef Packet4us half;
    606   enum
    607   {
    608     size = 8,
    609     alignment = Aligned16,
    610     vectorizable = true,
    611     masked_load_available = false,
    612     masked_store_available = false
    613   };
    614 };
    615 template<> struct unpacket_traits<Packet2i>
    616 {
    617   typedef int32_t type;
    618   typedef Packet2i half;
    619   enum
    620   {
    621     size = 2,
    622     alignment = Aligned16,
    623     vectorizable = true,
    624     masked_load_available = false,
    625     masked_store_available = false
    626   };
    627 };
    628 template<> struct unpacket_traits<Packet4i>
    629 {
    630   typedef int32_t type;
    631   typedef Packet2i half;
    632   enum
    633   {
    634     size = 4,
    635     alignment = Aligned16,
    636     vectorizable = true,
    637     masked_load_available = false,
    638     masked_store_available = false
    639   };
    640 };
    641 template<> struct unpacket_traits<Packet2ui>
    642 {
    643   typedef uint32_t type;
    644   typedef Packet2ui half;
    645   enum
    646   {
    647     size = 2,
    648     alignment = Aligned16,
    649     vectorizable = true,
    650     masked_load_available = false,
    651     masked_store_available = false
    652   };
    653 };
    654 template<> struct unpacket_traits<Packet4ui>
    655 {
    656   typedef uint32_t type;
    657   typedef Packet2ui half;
    658   enum
    659   {
    660     size = 4,
    661     alignment = Aligned16,
    662     vectorizable = true,
    663     masked_load_available = false,
    664     masked_store_available = false
    665   };
    666 };
    667 template<> struct unpacket_traits<Packet2l>
    668 {
    669   typedef int64_t type;
    670   typedef Packet2l half;
    671   enum
    672   {
    673     size = 2,
    674     alignment = Aligned16,
    675     vectorizable = true,
    676     masked_load_available = false,
    677     masked_store_available = false
    678   };
    679 };
    680 template<> struct unpacket_traits<Packet2ul>
    681 {
    682   typedef uint64_t type;
    683   typedef Packet2ul half;
    684   enum
    685   {
    686     size = 2,
    687     alignment = Aligned16,
    688     vectorizable = true,
    689     masked_load_available = false,
    690     masked_store_available = false
    691   };
    692 };
    693 
    694 template<> EIGEN_STRONG_INLINE Packet2f pset1<Packet2f>(const float& from) { return vdup_n_f32(from); }
    695 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
    696 template<> EIGEN_STRONG_INLINE Packet4c pset1<Packet4c>(const int8_t& from)
    697 { return vget_lane_s32(vreinterpret_s32_s8(vdup_n_s8(from)), 0); }
    698 template<> EIGEN_STRONG_INLINE Packet8c pset1<Packet8c>(const int8_t& from) { return vdup_n_s8(from); }
    699 template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const int8_t& from) { return vdupq_n_s8(from); }
    700 template<> EIGEN_STRONG_INLINE Packet4uc pset1<Packet4uc>(const uint8_t& from)
    701 { return vget_lane_u32(vreinterpret_u32_u8(vdup_n_u8(from)), 0); }
    702 template<> EIGEN_STRONG_INLINE Packet8uc pset1<Packet8uc>(const uint8_t& from) { return vdup_n_u8(from); }
    703 template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const uint8_t& from) { return vdupq_n_u8(from); }
    704 template<> EIGEN_STRONG_INLINE Packet4s pset1<Packet4s>(const int16_t& from) { return vdup_n_s16(from); }
    705 template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const int16_t& from) { return vdupq_n_s16(from); }
    706 template<> EIGEN_STRONG_INLINE Packet4us pset1<Packet4us>(const uint16_t& from) { return vdup_n_u16(from); }
    707 template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const uint16_t& from) { return vdupq_n_u16(from); }
    708 template<> EIGEN_STRONG_INLINE Packet2i pset1<Packet2i>(const int32_t& from) { return vdup_n_s32(from); }
    709 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) { return vdupq_n_s32(from); }
    710 template<> EIGEN_STRONG_INLINE Packet2ui pset1<Packet2ui>(const uint32_t& from) { return vdup_n_u32(from); }
    711 template<> EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) { return vdupq_n_u32(from); }
    712 template<> EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) { return vdupq_n_s64(from); }
    713 template<> EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(const uint64_t& from) { return vdupq_n_u64(from); }
    714 
    715 template<> EIGEN_STRONG_INLINE Packet2f pset1frombits<Packet2f>(unsigned int from)
    716 { return vreinterpret_f32_u32(vdup_n_u32(from)); }
    717 template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from)
    718 { return vreinterpretq_f32_u32(vdupq_n_u32(from)); }
    719 
    720 template<> EIGEN_STRONG_INLINE Packet2f plset<Packet2f>(const float& a)
    721 {
    722   const float c[] = {0.0f,1.0f};
    723   return vadd_f32(pset1<Packet2f>(a), vld1_f32(c));
    724 }
    725 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
    726 {
    727   const float c[] = {0.0f,1.0f,2.0f,3.0f};
    728   return vaddq_f32(pset1<Packet4f>(a), vld1q_f32(c));
    729 }
    730 template<> EIGEN_STRONG_INLINE Packet4c plset<Packet4c>(const int8_t& a)
    731 { return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_u32(vdup_n_u32(0x03020100)), vdup_n_s8(a))), 0); }
    732 template<> EIGEN_STRONG_INLINE Packet8c plset<Packet8c>(const int8_t& a)
    733 {
    734   const int8_t c[] = {0,1,2,3,4,5,6,7};
    735   return vadd_s8(pset1<Packet8c>(a), vld1_s8(c));
    736 }
    737 template<> EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const int8_t& a)
    738 {
    739   const int8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
    740   return vaddq_s8(pset1<Packet16c>(a), vld1q_s8(c));
    741 }
    742 template<> EIGEN_STRONG_INLINE Packet4uc plset<Packet4uc>(const uint8_t& a)
    743 { return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(0x03020100)), vdup_n_u8(a))), 0); }
    744 template<> EIGEN_STRONG_INLINE Packet8uc plset<Packet8uc>(const uint8_t& a)
    745 {
    746   const uint8_t c[] = {0,1,2,3,4,5,6,7};
    747   return vadd_u8(pset1<Packet8uc>(a), vld1_u8(c));
    748 }
    749 template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const uint8_t& a)
    750 {
    751   const uint8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
    752   return vaddq_u8(pset1<Packet16uc>(a), vld1q_u8(c));
    753 }
    754 template<> EIGEN_STRONG_INLINE Packet4s plset<Packet4s>(const int16_t& a)
    755 {
    756   const int16_t c[] = {0,1,2,3};
    757   return vadd_s16(pset1<Packet4s>(a), vld1_s16(c));
    758 }
    759 template<> EIGEN_STRONG_INLINE Packet4us plset<Packet4us>(const uint16_t& a)
    760 {
    761   const uint16_t c[] = {0,1,2,3};
    762   return vadd_u16(pset1<Packet4us>(a), vld1_u16(c));
    763 }
    764 template<> EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const int16_t& a)
    765 {
    766   const int16_t c[] = {0,1,2,3,4,5,6,7};
    767   return vaddq_s16(pset1<Packet8s>(a), vld1q_s16(c));
    768 }
    769 template<> EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const uint16_t& a)
    770 {
    771   const uint16_t c[] = {0,1,2,3,4,5,6,7};
    772   return vaddq_u16(pset1<Packet8us>(a), vld1q_u16(c));
    773 }
    774 template<> EIGEN_STRONG_INLINE Packet2i plset<Packet2i>(const int32_t& a)
    775 {
    776   const int32_t c[] = {0,1};
    777   return vadd_s32(pset1<Packet2i>(a), vld1_s32(c));
    778 }
    779 template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a)
    780 {
    781   const int32_t c[] = {0,1,2,3};
    782   return vaddq_s32(pset1<Packet4i>(a), vld1q_s32(c));
    783 }
    784 template<> EIGEN_STRONG_INLINE Packet2ui plset<Packet2ui>(const uint32_t& a)
    785 {
    786   const uint32_t c[] = {0,1};
    787   return vadd_u32(pset1<Packet2ui>(a), vld1_u32(c));
    788 }
    789 template<> EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a)
    790 {
    791   const uint32_t c[] = {0,1,2,3};
    792   return vaddq_u32(pset1<Packet4ui>(a), vld1q_u32(c));
    793 }
    794 template<> EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a)
    795 {
    796   const int64_t c[] = {0,1};
    797   return vaddq_s64(pset1<Packet2l>(a), vld1q_s64(c));
    798 }
    799 template<> EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(const uint64_t& a)
    800 {
    801   const uint64_t c[] = {0,1};
    802   return vaddq_u64(pset1<Packet2ul>(a), vld1q_u64(c));
    803 }
    804 
    805 template<> EIGEN_STRONG_INLINE Packet2f padd<Packet2f>(const Packet2f& a, const Packet2f& b) { return vadd_f32(a,b); }
    806 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
    807 template<> EIGEN_STRONG_INLINE Packet4c padd<Packet4c>(const Packet4c& a, const Packet4c& b)
    808 {
    809   return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(
    810       vreinterpret_s8_s32(vdup_n_s32(a)),
    811       vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
    812 }
    813 template<> EIGEN_STRONG_INLINE Packet8c padd<Packet8c>(const Packet8c& a, const Packet8c& b) { return vadd_s8(a,b); }
    814 template<> EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) { return vaddq_s8(a,b); }
    815 template<> EIGEN_STRONG_INLINE Packet4uc padd<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
    816 {
    817   return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(
    818       vreinterpret_u8_u32(vdup_n_u32(a)),
    819       vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
    820 }
    821 template<> EIGEN_STRONG_INLINE Packet8uc padd<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vadd_u8(a,b); }
    822 template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vaddq_u8(a,b); }
    823 template<> EIGEN_STRONG_INLINE Packet4s padd<Packet4s>(const Packet4s& a, const Packet4s& b) { return vadd_s16(a,b); }
    824 template<> EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) { return vaddq_s16(a,b); }
    825 template<> EIGEN_STRONG_INLINE Packet4us padd<Packet4us>(const Packet4us& a, const Packet4us& b) { return vadd_u16(a,b); }
    826 template<> EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) { return vaddq_u16(a,b); }
    827 template<> EIGEN_STRONG_INLINE Packet2i padd<Packet2i>(const Packet2i& a, const Packet2i& b) { return vadd_s32(a,b); }
    828 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); }
    829 template<> EIGEN_STRONG_INLINE Packet2ui padd<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vadd_u32(a,b); }
    830 template<> EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vaddq_u32(a,b); }
    831 template<> EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) { return vaddq_s64(a,b); }
    832 template<> EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { return vaddq_u64(a,b); }
    833 
    834 template<> EIGEN_STRONG_INLINE Packet2f psub<Packet2f>(const Packet2f& a, const Packet2f& b) { return vsub_f32(a,b); }  // psub(a,b): a - b, forwarded to the NEON subtract intrinsic matching the packet's lane type
    835 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); }
    836 template<> EIGEN_STRONG_INLINE Packet4c psub<Packet4c>(const Packet4c& a, const Packet4c& b)
    837 {  // a and b are handled as 32-bit scalars here (they are passed to vdup_n_s32) and processed as int8 lanes
    838   return vget_lane_s32(vreinterpret_s32_s8(vsub_s8(  // subtract as int8 lanes, then read 32-bit lane 0 back out as the result
    839       vreinterpret_s8_s32(vdup_n_s32(a)),  // broadcast the scalar into a 64-bit vector and view it as int8 lanes
    840       vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
    841 }
    842 template<> EIGEN_STRONG_INLINE Packet8c psub<Packet8c>(const Packet8c& a, const Packet8c& b) { return vsub_s8(a,b); }
    843 template<> EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) { return vsubq_s8(a,b); }
    844 template<> EIGEN_STRONG_INLINE Packet4uc psub<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
    845 {  // unsigned counterpart of the Packet4c version: same broadcast / reinterpret / extract-lane-0 sequence
    846   return vget_lane_u32(vreinterpret_u32_u8(vsub_u8(
    847       vreinterpret_u8_u32(vdup_n_u32(a)),
    848       vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
    849 }
    850 template<> EIGEN_STRONG_INLINE Packet8uc psub<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vsub_u8(a,b); }
    851 template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vsubq_u8(a,b); }
    852 template<> EIGEN_STRONG_INLINE Packet4s psub<Packet4s>(const Packet4s& a, const Packet4s& b) { return vsub_s16(a,b); }
    853 template<> EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) { return vsubq_s16(a,b); }
    854 template<> EIGEN_STRONG_INLINE Packet4us psub<Packet4us>(const Packet4us& a, const Packet4us& b) { return vsub_u16(a,b); }
    855 template<> EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) { return vsubq_u16(a,b); }
    856 template<> EIGEN_STRONG_INLINE Packet2i psub<Packet2i>(const Packet2i& a, const Packet2i& b) { return vsub_s32(a,b); }
    857 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); }
    858 template<> EIGEN_STRONG_INLINE Packet2ui psub<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vsub_u32(a,b); }
    859 template<> EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vsubq_u32(a,b); }
    860 template<> EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) { return vsubq_s64(a,b); }
    861 template<> EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { return vsubq_u64(a,b); }
    862 
    863 template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b);  // declaration only, so paddsub below can call pxor; its definition is not in this part of the file
    864 template<> EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(const Packet2f& a, const Packet2f & b) {  // returns {a0 - b0, a1 + b1}
    865   Packet2f mask = {numext::bit_cast<float>(0x80000000u), 0.0f};  // lane 0 carries only the bit 0x80000000 (the float sign bit); lane 1 carries no bits
    866   return padd(a, pxor(mask, b));  // XOR with the mask flips the sign of b in lane 0 only, so the add subtracts there and adds in lane 1
    867 }
    868 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);  // declaration only, so paddsub below can call pxor; its definition is not in this part of the file
    869 template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {  // returns {a0 - b0, a1 + b1, a2 - b2, a3 + b3}
    870   Packet4f mask = {numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f};  // sign bit set in the even lanes (0 and 2) only
    871   return padd(a, pxor(mask, b));  // XOR flips the sign of b in the even lanes, turning the add into a subtract there
    872 }
    873 
    874 template<> EIGEN_STRONG_INLINE Packet2f pnegate(const Packet2f& a) { return vneg_f32(a); }  // pnegate(a): forwarded to the NEON negate intrinsic matching the packet's lane type
    875 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); }
    876 template<> EIGEN_STRONG_INLINE Packet4c pnegate(const Packet4c& a)  // a is handled as a 32-bit scalar (passed to vdup_n_s32) and negated as int8 lanes
    877 { return vget_lane_s32(vreinterpret_s32_s8(vneg_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }  // broadcast, view as int8, negate, read 32-bit lane 0 back
    878 template<> EIGEN_STRONG_INLINE Packet8c pnegate(const Packet8c& a) { return vneg_s8(a); }
    879 template<> EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) { return vnegq_s8(a); }
    880 template<> EIGEN_STRONG_INLINE Packet4s pnegate(const Packet4s& a) { return vneg_s16(a); }
    881 template<> EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) { return vnegq_s16(a); }
    882 template<> EIGEN_STRONG_INLINE Packet2i pnegate(const Packet2i& a) { return vneg_s32(a); }
    883 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); }
    884 template<> EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {  // negates both int64 lanes of a
    885 #if EIGEN_ARCH_ARM64
    886   return vnegq_s64(a);  // vector negate, used only when EIGEN_ARCH_ARM64 is set
    887 #else
    888   return vcombine_s64(  // otherwise: negate each lane as a scalar and rebuild the two-lane vector
    889       vdup_n_s64(-vgetq_lane_s64(a, 0)),
    890       vdup_n_s64(-vgetq_lane_s64(a, 1)));  // NOTE(review): scalar negation of the minimum int64 value overflows; confirm callers never pass it
    891 #endif
    892 }
    893 
    894 template<> EIGEN_STRONG_INLINE Packet2f pconj(const Packet2f& a) { return a; }  // pconj(a): identity for every packet type in this run; the input is returned unchanged
    895 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
    896 template<> EIGEN_STRONG_INLINE Packet4c pconj(const Packet4c& a) { return a; }
    897 template<> EIGEN_STRONG_INLINE Packet8c pconj(const Packet8c& a) { return a; }
    898 template<> EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) { return a; }
    899 template<> EIGEN_STRONG_INLINE Packet4uc pconj(const Packet4uc& a) { return a; }
    900 template<> EIGEN_STRONG_INLINE Packet8uc pconj(const Packet8uc& a) { return a; }
    901 template<> EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) { return a; }
    902 template<> EIGEN_STRONG_INLINE Packet4s pconj(const Packet4s& a) { return a; }
    903 template<> EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) { return a; }
    904 template<> EIGEN_STRONG_INLINE Packet4us pconj(const Packet4us& a) { return a; }
    905 template<> EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) { return a; }
    906 template<> EIGEN_STRONG_INLINE Packet2i pconj(const Packet2i& a) { return a; }
    907 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
    908 template<> EIGEN_STRONG_INLINE Packet2ui pconj(const Packet2ui& a) { return a; }
    909 template<> EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) { return a; }
    910 template<> EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) { return a; }
    911 template<> EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) { return a; }
    912 
    913 template<> EIGEN_STRONG_INLINE Packet2f pmul<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmul_f32(a,b); }  // pmul(a,b): forwarded to the NEON multiply intrinsic matching the packet's lane type
    914 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
    915 template<> EIGEN_STRONG_INLINE Packet4c pmul<Packet4c>(const Packet4c& a, const Packet4c& b)
    916 {  // a and b are handled as 32-bit scalars here (they are passed to vdup_n_s32) and multiplied as int8 lanes
    917   return vget_lane_s32(vreinterpret_s32_s8(vmul_s8(  // multiply as int8 lanes, then read 32-bit lane 0 back out as the result
    918       vreinterpret_s8_s32(vdup_n_s32(a)),  // broadcast the scalar into a 64-bit vector and view it as int8 lanes
    919       vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
    920 }
    921 template<> EIGEN_STRONG_INLINE Packet8c pmul<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmul_s8(a,b); }
    922 template<> EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) { return vmulq_s8(a,b); }
    923 template<> EIGEN_STRONG_INLINE Packet4uc pmul<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
    924 {  // unsigned counterpart of the Packet4c version: same broadcast / reinterpret / extract-lane-0 sequence
    925   return vget_lane_u32(vreinterpret_u32_u8(vmul_u8(
    926       vreinterpret_u8_u32(vdup_n_u32(a)),
    927       vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
    928 }
    929 template<> EIGEN_STRONG_INLINE Packet8uc pmul<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmul_u8(a,b); }
    930 template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vmulq_u8(a,b); }
    931 template<> EIGEN_STRONG_INLINE Packet4s pmul<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmul_s16(a,b); }
    932 template<> EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) { return vmulq_s16(a,b); }
    933 template<> EIGEN_STRONG_INLINE Packet4us pmul<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmul_u16(a,b); }
    934 template<> EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) { return vmulq_u16(a,b); }
    935 template<> EIGEN_STRONG_INLINE Packet2i pmul<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmul_s32(a,b); }
    936 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
    937 template<> EIGEN_STRONG_INLINE Packet2ui pmul<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmul_u32(a,b); }
    938 template<> EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vmulq_u32(a,b); }
    939 template<> EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {  // int64 product computed lane by lane with scalar arithmetic rather than a vector multiply
    940   return vcombine_s64(  // rebuild the two-lane result from the two scalar products
    941     vdup_n_s64(vgetq_lane_s64(a, 0)*vgetq_lane_s64(b, 0)),  // lane 0: a0 * b0
    942     vdup_n_s64(vgetq_lane_s64(a, 1)*vgetq_lane_s64(b, 1)));  // lane 1: a1 * b1
    943 }
    944 template<> EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {  // uint64 product computed lane by lane with scalar arithmetic rather than a vector multiply
    945   return vcombine_u64(  // rebuild the two-lane result from the two scalar products
    946     vdup_n_u64(vgetq_lane_u64(a, 0)*vgetq_lane_u64(b, 0)),  // lane 0: a0 * b0
    947     vdup_n_u64(vgetq_lane_u64(a, 1)*vgetq_lane_u64(b, 1)));  // lane 1: a1 * b1
    948 }
    949 
    950 template<> EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b)
    951 {
    952 #if EIGEN_ARCH_ARM64
    953   return vdiv_f32(a,b);
    954 #else
    955   Packet2f inv, restep, div;
    956 
    957   // 32-bit NEON does not offer a divide instruction, so a/b is computed as a * (1/b).
    958   // NEON provides both a reciprocal estimate and a reciprocal step instruction:
    959   // vrecpe_f32() returns a coarse estimate of 1/b, and vrecps_f32(b, x) returns
    960   // (2 - b*x), the factor of the Newton-Raphson update x' = x * (2 - b*x).
    961   // Each update roughly doubles the number of correct bits in the estimate.
    962   inv = vrecpe_f32(b);
    963 
    964   // First Newton-Raphson refinement of the estimate.
    965   restep = vrecps_f32(b, inv);
    966   inv = vmul_f32(restep, inv);
    967 
    968   // Second refinement: a single step leaves the quotient well short of float precision.
    969   restep = vrecps_f32(b, inv);
    970   inv = vmul_f32(restep, inv);
    971   div = vmul_f32(a, inv);  // finally, a * (1/b) is the wanted quotient
    972   return div;
    973 #endif
    974 }
    975 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
    976 {
    977 #if EIGEN_ARCH_ARM64
    978   return vdivq_f32(a,b);
    979 #else
    980   Packet4f inv, restep, div;
    981 
    982   // 32-bit NEON does not offer a divide instruction, so a/b is computed as a * (1/b).
    983   // NEON provides both a reciprocal estimate and a reciprocal step instruction:
    984   // vrecpeq_f32() returns a coarse estimate of 1/b, and vrecpsq_f32(b, x) returns
    985   // (2 - b*x), the factor of the Newton-Raphson update x' = x * (2 - b*x).
    986   // Each update roughly doubles the number of correct bits in the estimate.
    987   inv = vrecpeq_f32(b);
    988 
    989   // First Newton-Raphson refinement of the estimate.
    990   restep = vrecpsq_f32(b, inv);
    991   inv = vmulq_f32(restep, inv);
    992 
    993   // Second refinement: a single step leaves the quotient well short of float precision.
    994   restep = vrecpsq_f32(b, inv);
    995   inv = vmulq_f32(restep, inv);
    996   div = vmulq_f32(a, inv);  // finally, a * (1/b) is the wanted quotient
    997   return div;
    998 #endif
    999 }
   1000 
   1001 template<> EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(const Packet4c& /*a*/, const Packet4c& /*b*/)  // integer pdiv stubs: every specialization in this run ignores its arguments
   1002 {
   1003   eigen_assert(false && "packet integer division are not supported by NEON");  // always-false condition; the string only serves as the assertion message
   1004   return pset1<Packet4c>(0);  // value returned if execution continues past the assert (presumably when eigen_assert is compiled out - confirm)
   1005 }
   1006 template<> EIGEN_STRONG_INLINE Packet8c pdiv<Packet8c>(const Packet8c& /*a*/, const Packet8c& /*b*/)
   1007 {
   1008   eigen_assert(false && "packet integer division are not supported by NEON");
   1009   return pset1<Packet8c>(0);
   1010 }
   1011 template<> EIGEN_STRONG_INLINE Packet16c pdiv<Packet16c>(const Packet16c& /*a*/, const Packet16c& /*b*/)
   1012 {
   1013   eigen_assert(false && "packet integer division are not supported by NEON");
   1014   return pset1<Packet16c>(0);
   1015 }
   1016 template<> EIGEN_STRONG_INLINE Packet4uc pdiv<Packet4uc>(const Packet4uc& /*a*/, const Packet4uc& /*b*/)
   1017 {
   1018   eigen_assert(false && "packet integer division are not supported by NEON");
   1019   return pset1<Packet4uc>(0);
   1020 }
   1021 template<> EIGEN_STRONG_INLINE Packet8uc pdiv<Packet8uc>(const Packet8uc& /*a*/, const Packet8uc& /*b*/)
   1022 {
   1023   eigen_assert(false && "packet integer division are not supported by NEON");
   1024   return pset1<Packet8uc>(0);
   1025 }
   1026 template<> EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& /*a*/, const Packet16uc& /*b*/)
   1027 {
   1028   eigen_assert(false && "packet integer division are not supported by NEON");
   1029   return pset1<Packet16uc>(0);
   1030 }
   1031 template<> EIGEN_STRONG_INLINE Packet4s pdiv<Packet4s>(const Packet4s& /*a*/, const Packet4s& /*b*/)
   1032 {
   1033   eigen_assert(false && "packet integer division are not supported by NEON");
   1034   return pset1<Packet4s>(0);
   1035 }
   1036 template<> EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& /*a*/, const Packet8s& /*b*/)
   1037 {
   1038   eigen_assert(false && "packet integer division are not supported by NEON");
   1039   return pset1<Packet8s>(0);
   1040 }
   1041 template<> EIGEN_STRONG_INLINE Packet4us pdiv<Packet4us>(const Packet4us& /*a*/, const Packet4us& /*b*/)
   1042 {
   1043   eigen_assert(false && "packet integer division are not supported by NEON");
   1044   return pset1<Packet4us>(0);
   1045 }
   1046 template<> EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& /*a*/, const Packet8us& /*b*/)
   1047 {
   1048   eigen_assert(false && "packet integer division are not supported by NEON");
   1049   return pset1<Packet8us>(0);
   1050 }
   1051 template<> EIGEN_STRONG_INLINE Packet2i pdiv<Packet2i>(const Packet2i& /*a*/, const Packet2i& /*b*/)
   1052 {
   1053   eigen_assert(false && "packet integer division are not supported by NEON");
   1054   return pset1<Packet2i>(0);
   1055 }
   1056 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
   1057 {
   1058   eigen_assert(false && "packet integer division are not supported by NEON");
   1059   return pset1<Packet4i>(0);
   1060 }
   1061 template<> EIGEN_STRONG_INLINE Packet2ui pdiv<Packet2ui>(const Packet2ui& /*a*/, const Packet2ui& /*b*/)
   1062 {
   1063   eigen_assert(false && "packet integer division are not supported by NEON");
   1064   return pset1<Packet2ui>(0);
   1065 }
   1066 template<> EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& /*a*/, const Packet4ui& /*b*/)
   1067 {
   1068   eigen_assert(false && "packet integer division are not supported by NEON");
   1069   return pset1<Packet4ui>(0);
   1070 }
   1071 template<> EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& /*a*/, const Packet2l& /*b*/)
   1072 {
   1073   eigen_assert(false && "packet integer division are not supported by NEON");
   1074   return pset1<Packet2l>(0LL);  // 64-bit literal for the 64-bit lane type
   1075 }
   1076 template<> EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& /*a*/, const Packet2ul& /*b*/)
   1077 {
   1078   eigen_assert(false && "packet integer division are not supported by NEON");
   1079   return pset1<Packet2ul>(0ULL);  // unsigned 64-bit literal for the unsigned 64-bit lane type
   1080 }
   1081 
   1082 
   1083 #ifdef __ARM_FEATURE_FMA
   1084 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)  // pmadd(a,b,c): a*b + c, using the vfma intrinsics when __ARM_FEATURE_FMA is defined
   1085 { return vfmaq_f32(c,a,b); }  // the intrinsic takes the addend first, hence the (c,a,b) argument order
   1086 template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
   1087 { return vfma_f32(c,a,b); }
   1088 #else
   1089 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)  // same contract, implemented with the vmla intrinsics when __ARM_FEATURE_FMA is not defined
   1090 {
   1091   return vmlaq_f32(c,a,b);  // addend first here as well
   1092 }
   1093 template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
   1094 {
   1095   return vmla_f32(c,a,b);
   1096 }
   1097 #endif
   1098 
   1099 // No FMA instruction for int, so use MLA unconditionally.
   1100 template<> EIGEN_STRONG_INLINE Packet4c pmadd(const Packet4c& a, const Packet4c& b, const Packet4c& c)  // pmadd(a,b,c): a*b + c on integer lanes; a, b and c are 32-bit scalars here, processed as int8 lanes
   1101 {
   1102   return vget_lane_s32(vreinterpret_s32_s8(vmla_s8(  // multiply-accumulate as int8 lanes, then read 32-bit lane 0 back out
   1103       vreinterpret_s8_s32(vdup_n_s32(c)),  // the addend c goes first, matching the vmla argument order used throughout this run
   1104       vreinterpret_s8_s32(vdup_n_s32(a)),
   1105       vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
   1106 }
   1107 template<> EIGEN_STRONG_INLINE Packet8c pmadd(const Packet8c& a, const Packet8c& b, const Packet8c& c)
   1108 { return vmla_s8(c,a,b); }
   1109 template<> EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c)
   1110 { return vmlaq_s8(c,a,b); }
   1111 template<> EIGEN_STRONG_INLINE Packet4uc pmadd(const Packet4uc& a, const Packet4uc& b, const Packet4uc& c)  // unsigned counterpart of the Packet4c version
   1112 {
   1113   return vget_lane_u32(vreinterpret_u32_u8(vmla_u8(
   1114       vreinterpret_u8_u32(vdup_n_u32(c)),
   1115       vreinterpret_u8_u32(vdup_n_u32(a)),
   1116       vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
   1117 }
   1118 template<> EIGEN_STRONG_INLINE Packet8uc pmadd(const Packet8uc& a, const Packet8uc& b, const Packet8uc& c)
   1119 { return vmla_u8(c,a,b); }
   1120 template<> EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c)
   1121 { return vmlaq_u8(c,a,b); }
   1122 template<> EIGEN_STRONG_INLINE Packet4s pmadd(const Packet4s& a, const Packet4s& b, const Packet4s& c)
   1123 { return vmla_s16(c,a,b); }
   1124 template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c)
   1125 { return vmlaq_s16(c,a,b); }
   1126 template<> EIGEN_STRONG_INLINE Packet4us pmadd(const Packet4us& a, const Packet4us& b, const Packet4us& c)
   1127 { return vmla_u16(c,a,b); }
   1128 template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c)
   1129 { return vmlaq_u16(c,a,b); }
   1130 template<> EIGEN_STRONG_INLINE Packet2i pmadd(const Packet2i& a, const Packet2i& b, const Packet2i& c)
   1131 { return vmla_s32(c,a,b); }
   1132 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c)
   1133 { return vmlaq_s32(c,a,b); }
   1134 template<> EIGEN_STRONG_INLINE Packet2ui pmadd(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c)
   1135 { return vmla_u32(c,a,b); }
   1136 template<> EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c)
   1137 { return vmlaq_u32(c,a,b); }
   1138 
   1139 template<> EIGEN_STRONG_INLINE Packet2f pabsdiff<Packet2f>(const Packet2f& a, const Packet2f& b)  // pabsdiff(a,b): forwarded to the NEON vabd (absolute difference) intrinsic matching the packet's lane type
   1140 { return vabd_f32(a,b); }
   1141 template<> EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b)
   1142 { return vabdq_f32(a,b); }
   1143 template<> EIGEN_STRONG_INLINE Packet4c pabsdiff<Packet4c>(const Packet4c& a, const Packet4c& b)  // a and b are 32-bit scalars here (passed to vdup_n_s32), processed as int8 lanes
   1144 {
   1145   return vget_lane_s32(vreinterpret_s32_s8(vabd_s8(  // operate on int8 lanes, then read 32-bit lane 0 back out
   1146       vreinterpret_s8_s32(vdup_n_s32(a)),
   1147       vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
   1148 }
   1149 template<> EIGEN_STRONG_INLINE Packet8c pabsdiff<Packet8c>(const Packet8c& a, const Packet8c& b)
   1150 { return vabd_s8(a,b); }
   1151 template<> EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b)
   1152 { return vabdq_s8(a,b); }
   1153 template<> EIGEN_STRONG_INLINE Packet4uc pabsdiff<Packet4uc>(const Packet4uc& a, const Packet4uc& b)  // unsigned counterpart of the Packet4c version
   1154 {
   1155   return vget_lane_u32(vreinterpret_u32_u8(vabd_u8(
   1156       vreinterpret_u8_u32(vdup_n_u32(a)),
   1157       vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
   1158 }
   1159 template<> EIGEN_STRONG_INLINE Packet8uc pabsdiff<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
   1160 { return vabd_u8(a,b); }
   1161 template<> EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
   1162 { return vabdq_u8(a,b); }
   1163 template<> EIGEN_STRONG_INLINE Packet4s pabsdiff<Packet4s>(const Packet4s& a, const Packet4s& b)
   1164 { return vabd_s16(a,b); }
   1165 template<> EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b)
   1166 { return vabdq_s16(a,b); }
   1167 template<> EIGEN_STRONG_INLINE Packet4us pabsdiff<Packet4us>(const Packet4us& a, const Packet4us& b)
   1168 { return vabd_u16(a,b); }
   1169 template<> EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b)
   1170 { return vabdq_u16(a,b); }
   1171 template<> EIGEN_STRONG_INLINE Packet2i pabsdiff<Packet2i>(const Packet2i& a, const Packet2i& b)
   1172 { return vabd_s32(a,b); }
   1173 template<> EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b)
   1174 { return vabdq_s32(a,b); }
   1175 template<> EIGEN_STRONG_INLINE Packet2ui pabsdiff<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
   1176 { return vabd_u32(a,b); }
   1177 template<> EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
   1178 { return vabdq_u32(a,b); }
   1179 
   1180 template<> EIGEN_STRONG_INLINE Packet2f pmin<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmin_f32(a,b); }  // default float pmin: forwarded to the NEON vmin intrinsic
   1181 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
   1182 
   1183 #ifdef __ARM_FEATURE_NUMERIC_MAXMIN
   1184 // numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
   1185 template<> EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) { return vminnmq_f32(a, b); }  // PropagateNumbers policy: uses the vminnm intrinsic instead of vmin
   1186 template<> EIGEN_STRONG_INLINE Packet2f pmin<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) { return vminnm_f32(a, b); }
   1187 #endif
   1188 
   1189 template<> EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) { return pmin<Packet4f>(a, b); }  // PropagateNaN policy reuses the default pmin above (presumably because vmin already returns NaN when an input is NaN - confirm against the Arm reference)
   1190 
   1191 template<> EIGEN_STRONG_INLINE Packet2f pmin<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) { return pmin<Packet2f>(a, b); }
   1192 
   1193 template<> EIGEN_STRONG_INLINE Packet4c pmin<Packet4c>(const Packet4c& a, const Packet4c& b)  // integer pmin: a and b are 32-bit scalars here (passed to vdup_n_s32), compared as int8 lanes
   1194 {
   1195   return vget_lane_s32(vreinterpret_s32_s8(vmin_s8(  // take the minimum per int8 lane, then read 32-bit lane 0 back out
   1196       vreinterpret_s8_s32(vdup_n_s32(a)),
   1197       vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
   1198 }
   1199 template<> EIGEN_STRONG_INLINE Packet8c pmin<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmin_s8(a,b); }  // the remaining one-liners forward to the vmin intrinsic matching the packet's lane type
   1200 template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vminq_s8(a,b); }
   1201 template<> EIGEN_STRONG_INLINE Packet4uc pmin<Packet4uc>(const Packet4uc& a, const Packet4uc& b)  // unsigned counterpart of the Packet4c version
   1202 {
   1203   return vget_lane_u32(vreinterpret_u32_u8(vmin_u8(
   1204       vreinterpret_u8_u32(vdup_n_u32(a)),
   1205       vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
   1206 }
   1207 template<> EIGEN_STRONG_INLINE Packet8uc pmin<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmin_u8(a,b); }
   1208 template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vminq_u8(a,b); }
   1209 template<> EIGEN_STRONG_INLINE Packet4s pmin<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmin_s16(a,b); }
   1210 template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vminq_s16(a,b); }
   1211 template<> EIGEN_STRONG_INLINE Packet4us pmin<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmin_u16(a,b); }
   1212 template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vminq_u16(a,b); }
   1213 template<> EIGEN_STRONG_INLINE Packet2i pmin<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmin_s32(a,b); }
   1214 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }
   1215 template<> EIGEN_STRONG_INLINE Packet2ui pmin<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmin_u32(a,b); }
   1216 template<> EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vminq_u32(a,b); }
   1217 template<> EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {  // int64 minimum computed lane by lane with std::min rather than a vector intrinsic
   1218   return vcombine_s64(  // rebuild the two-lane result from the two scalar minima
   1219       vdup_n_s64((std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),  // std::min is parenthesized so a function-like macro named min cannot expand here
   1220       vdup_n_s64((std::min)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
   1221 }
   1222 template<> EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {  // uint64 minimum computed lane by lane with std::min rather than a vector intrinsic
   1223   return vcombine_u64(  // rebuild the two-lane result from the two scalar minima
   1224       vdup_n_u64((std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),  // std::min is parenthesized so a function-like macro named min cannot expand here
   1225       vdup_n_u64((std::min)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
   1226 }
   1227 
   1228 template<> EIGEN_STRONG_INLINE Packet2f pmax<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmax_f32(a,b); }  // default float pmax: forwarded to the NEON vmax intrinsic
   1229 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
   1230 
   1231 #ifdef __ARM_FEATURE_NUMERIC_MAXMIN
   1232 // numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
   1233 template<> EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxnmq_f32(a, b); }  // PropagateNumbers policy: uses the vmaxnm intrinsic instead of vmax
   1234 template<> EIGEN_STRONG_INLINE Packet2f pmax<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) { return vmaxnm_f32(a, b); }
   1235 #endif
   1236 
   1237 template<> EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) { return pmax<Packet4f>(a, b); }  // PropagateNaN policy reuses the default pmax above (presumably because vmax already returns NaN when an input is NaN - confirm against the Arm reference)
   1238 
   1239 template<> EIGEN_STRONG_INLINE Packet2f pmax<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) { return pmax<Packet2f>(a, b); }
   1240 
   1241 template<> EIGEN_STRONG_INLINE Packet4c pmax<Packet4c>(const Packet4c& a, const Packet4c& b)  // integer pmax: a and b are 32-bit scalars here (passed to vdup_n_s32), compared as int8 lanes
   1242 {
   1243   return vget_lane_s32(vreinterpret_s32_s8(vmax_s8(  // take the maximum per int8 lane, then read 32-bit lane 0 back out
   1244       vreinterpret_s8_s32(vdup_n_s32(a)),
   1245       vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
   1246 }
   1247 template<> EIGEN_STRONG_INLINE Packet8c pmax<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmax_s8(a,b); }  // the remaining one-liners forward to the vmax intrinsic matching the packet's lane type
   1248 template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) { return vmaxq_s8(a,b); }
   1249 template<> EIGEN_STRONG_INLINE Packet4uc pmax<Packet4uc>(const Packet4uc& a, const Packet4uc& b)  // unsigned counterpart of the Packet4c version
   1250 {
   1251   return vget_lane_u32(vreinterpret_u32_u8(vmax_u8(
   1252       vreinterpret_u8_u32(vdup_n_u32(a)),
   1253       vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
   1254 }
   1255 template<> EIGEN_STRONG_INLINE Packet8uc pmax<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmax_u8(a,b); }
   1256 template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vmaxq_u8(a,b); }
   1257 template<> EIGEN_STRONG_INLINE Packet4s pmax<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmax_s16(a,b); }
   1258 template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) { return vmaxq_s16(a,b); }
   1259 template<> EIGEN_STRONG_INLINE Packet4us pmax<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmax_u16(a,b); }
   1260 template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) { return vmaxq_u16(a,b); }
   1261 template<> EIGEN_STRONG_INLINE Packet2i pmax<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmax_s32(a,b); }
   1262 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }
   1263 template<> EIGEN_STRONG_INLINE Packet2ui pmax<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmax_u32(a,b); }
   1264 template<> EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vmaxq_u32(a,b); }
   1265 template<> EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {  // int64 maximum computed lane by lane with std::max rather than a vector intrinsic
   1266   return vcombine_s64(  // rebuild the two-lane result from the two scalar maxima
   1267       vdup_n_s64((std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),  // std::max is parenthesized so a function-like macro named max cannot expand here
   1268       vdup_n_s64((std::max)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
   1269 }
   1270 template<> EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {  // uint64 maximum computed lane by lane with std::max rather than a vector intrinsic
   1271   return vcombine_u64(  // rebuild the two-lane result from the two scalar maxima
   1272       vdup_n_u64((std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),  // std::max is parenthesized so a function-like macro named max cannot expand here
   1273       vdup_n_u64((std::max)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
   1274 }
   1275 
   1276 template<> EIGEN_STRONG_INLINE Packet2f pcmp_le<Packet2f>(const Packet2f& a, const Packet2f& b)  // pcmp_le(a,b): per-lane a <= b via the NEON vcle intrinsics; the result is returned in the packet's own type
   1277 { return vreinterpret_f32_u32(vcle_f32(a,b)); }  // the compare result is a u32 vector, so it is reinterpreted back to float lanes
   1278 template<> EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b)
   1279 { return vreinterpretq_f32_u32(vcleq_f32(a,b)); }
   1280 template<> EIGEN_STRONG_INLINE Packet4c pcmp_le<Packet4c>(const Packet4c& a, const Packet4c& b)  // a and b are 32-bit scalars here (passed to vdup_n_s32), compared as int8 lanes
   1281 {
   1282   return vget_lane_s32(vreinterpret_s32_u8(vcle_s8(  // the u8 compare result is viewed as s32 and 32-bit lane 0 is read back out
   1283       vreinterpret_s8_s32(vdup_n_s32(a)),
   1284       vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
   1285 }
   1286 template<> EIGEN_STRONG_INLINE Packet8c pcmp_le<Packet8c>(const Packet8c& a, const Packet8c& b)
   1287 { return vreinterpret_s8_u8(vcle_s8(a,b)); }  // signed packets: reinterpret the unsigned compare result back to the signed lane type
   1288 template<> EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(const Packet16c& a, const Packet16c& b)
   1289 { return vreinterpretq_s8_u8(vcleq_s8(a,b)); }
   1290 template<> EIGEN_STRONG_INLINE Packet4uc pcmp_le<Packet4uc>(const Packet4uc& a, const Packet4uc& b)  // unsigned counterpart of the Packet4c version
   1291 {
   1292   return vget_lane_u32(vreinterpret_u32_u8(vcle_u8(
   1293       vreinterpret_u8_u32(vdup_n_u32(a)),
   1294       vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
   1295 }
   1296 template<> EIGEN_STRONG_INLINE Packet8uc pcmp_le<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
   1297 { return vcle_u8(a,b); }  // unsigned packets: the compare result is returned directly, no reinterpret
   1298 template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
   1299 { return vcleq_u8(a,b); }
   1300 template<> EIGEN_STRONG_INLINE Packet4s pcmp_le<Packet4s>(const Packet4s& a, const Packet4s& b)
   1301 { return vreinterpret_s16_u16(vcle_s16(a,b)); }
   1302 template<> EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(const Packet8s& a, const Packet8s& b)
   1303 { return vreinterpretq_s16_u16(vcleq_s16(a,b)); }
   1304 template<> EIGEN_STRONG_INLINE Packet4us pcmp_le<Packet4us>(const Packet4us& a, const Packet4us& b)
   1305 { return vcle_u16(a,b); }
   1306 template<> EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(const Packet8us& a, const Packet8us& b)
   1307 { return vcleq_u16(a,b); }
   1308 template<> EIGEN_STRONG_INLINE Packet2i pcmp_le<Packet2i>(const Packet2i& a, const Packet2i& b)
   1309 { return vreinterpret_s32_u32(vcle_s32(a,b)); }
   1310 template<> EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(const Packet4i& a, const Packet4i& b)
   1311 { return vreinterpretq_s32_u32(vcleq_s32(a,b)); }
   1312 template<> EIGEN_STRONG_INLINE Packet2ui pcmp_le<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
   1313 { return vcle_u32(a,b); }
   1314 template<> EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
   1315 { return vcleq_u32(a,b); }
   1316 template<> EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(const Packet2l& a, const Packet2l& b)  // per-lane a <= b on two int64 lanes
   1317 {
   1318 #if EIGEN_ARCH_ARM64
   1319   return vreinterpretq_s64_u64(vcleq_s64(a,b));  // vector compare, used only when EIGEN_ARCH_ARM64 is set; the u64 result is reinterpreted to s64
   1320 #else
   1321   return vcombine_s64(  // otherwise: compare each lane as a scalar and rebuild the two-lane vector
   1322       vdup_n_s64(vgetq_lane_s64(a, 0) <= vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),  // lane is -1 (all bits set) when a <= b, else 0
   1323       vdup_n_s64(vgetq_lane_s64(a, 1) <= vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
   1324 #endif
   1325 }
   1326 template<> EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(const Packet2ul& a, const Packet2ul& b)  // per-lane a <= b on two uint64 lanes
   1327 {
   1328 #if EIGEN_ARCH_ARM64
   1329   return vcleq_u64(a,b);  // vector compare, used only when EIGEN_ARCH_ARM64 is set
   1330 #else
   1331   return vcombine_u64(  // otherwise: compare each lane as a scalar and rebuild the two-lane vector
   1332       vdup_n_u64(vgetq_lane_u64(a, 0) <= vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),  // uint64_t(-1) is the all-bits-set value; the lane is 0 when the comparison is false
   1333       vdup_n_u64(vgetq_lane_u64(a, 1) <= vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
   1334 #endif
   1335 }
   1336 
   1337 template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt<Packet2f>(const Packet2f& a, const Packet2f& b)
   1338 { return vreinterpret_f32_u32(vclt_f32(a,b)); }
   1339 template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b)
   1340 { return vreinterpretq_f32_u32(vcltq_f32(a,b)); }
   1341 template<> EIGEN_STRONG_INLINE Packet4c pcmp_lt<Packet4c>(const Packet4c& a, const Packet4c& b)
   1342 {
   1343   return vget_lane_s32(vreinterpret_s32_u8(vclt_s8(
   1344       vreinterpret_s8_s32(vdup_n_s32(a)),
   1345       vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
   1346 }
   1347 template<> EIGEN_STRONG_INLINE Packet8c pcmp_lt<Packet8c>(const Packet8c& a, const Packet8c& b)
   1348 { return vreinterpret_s8_u8(vclt_s8(a,b)); }
   1349 template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(const Packet16c& a, const Packet16c& b)
   1350 { return vreinterpretq_s8_u8(vcltq_s8(a,b)); }
   1351 template<> EIGEN_STRONG_INLINE Packet4uc pcmp_lt<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
   1352 {
   1353   return vget_lane_u32(vreinterpret_u32_u8(vclt_u8(
   1354       vreinterpret_u8_u32(vdup_n_u32(a)),
   1355       vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
   1356 }
   1357 template<> EIGEN_STRONG_INLINE Packet8uc pcmp_lt<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
   1358 { return vclt_u8(a,b); }
   1359 template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
   1360 { return vcltq_u8(a,b); }
   1361 template<> EIGEN_STRONG_INLINE Packet4s pcmp_lt<Packet4s>(const Packet4s& a, const Packet4s& b)
   1362 { return vreinterpret_s16_u16(vclt_s16(a,b)); }
   1363 template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(const Packet8s& a, const Packet8s& b)
   1364 { return vreinterpretq_s16_u16(vcltq_s16(a,b)); }
   1365 template<> EIGEN_STRONG_INLINE Packet4us pcmp_lt<Packet4us>(const Packet4us& a, const Packet4us& b)
   1366 { return vclt_u16(a,b); }
   1367 template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(const Packet8us& a, const Packet8us& b)
   1368 { return vcltq_u16(a,b); }
   1369 template<> EIGEN_STRONG_INLINE Packet2i pcmp_lt<Packet2i>(const Packet2i& a, const Packet2i& b)
   1370 { return vreinterpret_s32_u32(vclt_s32(a,b)); }
   1371 template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(const Packet4i& a, const Packet4i& b)
   1372 { return vreinterpretq_s32_u32(vcltq_s32(a,b)); }
   1373 template<> EIGEN_STRONG_INLINE Packet2ui pcmp_lt<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
   1374 { return vclt_u32(a,b); }
   1375 template<> EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
   1376 { return vcltq_u32(a,b); }
   1377 template<> EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(const Packet2l& a, const Packet2l& b)
   1378 {
   1379 #if EIGEN_ARCH_ARM64
   1380   return vreinterpretq_s64_u64(vcltq_s64(a,b));
   1381 #else
   1382   return vcombine_s64(
   1383       vdup_n_s64(vgetq_lane_s64(a, 0) < vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
   1384       vdup_n_s64(vgetq_lane_s64(a, 1) < vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
   1385 #endif
   1386 }
   1387 template<> EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
   1388 {
   1389 #if EIGEN_ARCH_ARM64
   1390   return vcltq_u64(a,b);
   1391 #else
   1392   return vcombine_u64(
   1393       vdup_n_u64(vgetq_lane_u64(a, 0) < vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
   1394       vdup_n_u64(vgetq_lane_u64(a, 1) < vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
   1395 #endif
   1396 }
   1397 
   1398 template<> EIGEN_STRONG_INLINE Packet2f pcmp_eq<Packet2f>(const Packet2f& a, const Packet2f& b)
   1399 { return vreinterpret_f32_u32(vceq_f32(a,b)); }
   1400 template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b)
   1401 { return vreinterpretq_f32_u32(vceqq_f32(a,b)); }
   1402 template<> EIGEN_STRONG_INLINE Packet4c pcmp_eq<Packet4c>(const Packet4c& a, const Packet4c& b)
   1403 {
   1404   return vget_lane_s32(vreinterpret_s32_u8(vceq_s8(
   1405       vreinterpret_s8_s32(vdup_n_s32(a)),
   1406       vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
   1407 }
   1408 template<> EIGEN_STRONG_INLINE Packet8c pcmp_eq<Packet8c>(const Packet8c& a, const Packet8c& b)
   1409 { return vreinterpret_s8_u8(vceq_s8(a,b)); }
   1410 template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(const Packet16c& a, const Packet16c& b)
   1411 { return vreinterpretq_s8_u8(vceqq_s8(a,b)); }
   1412 template<> EIGEN_STRONG_INLINE Packet4uc pcmp_eq<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
   1413 {
   1414   return vget_lane_u32(vreinterpret_u32_u8(vceq_u8(
   1415       vreinterpret_u8_u32(vdup_n_u32(a)),
   1416       vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
   1417 }
   1418 template<> EIGEN_STRONG_INLINE Packet8uc pcmp_eq<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
   1419 { return vceq_u8(a,b); }
   1420 template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
   1421 { return vceqq_u8(a,b); }
   1422 template<> EIGEN_STRONG_INLINE Packet4s pcmp_eq<Packet4s>(const Packet4s& a, const Packet4s& b)
   1423 { return vreinterpret_s16_u16(vceq_s16(a,b)); }
   1424 template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(const Packet8s& a, const Packet8s& b)
   1425 { return vreinterpretq_s16_u16(vceqq_s16(a,b)); }
   1426 template<> EIGEN_STRONG_INLINE Packet4us pcmp_eq<Packet4us>(const Packet4us& a, const Packet4us& b)
   1427 { return vceq_u16(a,b); }
   1428 template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(const Packet8us& a, const Packet8us& b)
   1429 { return vceqq_u16(a,b); }
   1430 template<> EIGEN_STRONG_INLINE Packet2i pcmp_eq<Packet2i>(const Packet2i& a, const Packet2i& b)
   1431 { return vreinterpret_s32_u32(vceq_s32(a,b)); }
   1432 template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(const Packet4i& a, const Packet4i& b)
   1433 { return vreinterpretq_s32_u32(vceqq_s32(a,b)); }
   1434 template<> EIGEN_STRONG_INLINE Packet2ui pcmp_eq<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
   1435 { return vceq_u32(a,b); }
   1436 template<> EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
   1437 { return vceqq_u32(a,b); }
   1438 template<> EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(const Packet2l& a, const Packet2l& b)
   1439 {
   1440 #if EIGEN_ARCH_ARM64
   1441   return vreinterpretq_s64_u64(vceqq_s64(a,b));
   1442 #else
   1443   return vcombine_s64(
   1444       vdup_n_s64(vgetq_lane_s64(a, 0) == vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
   1445       vdup_n_s64(vgetq_lane_s64(a, 1) == vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
   1446 #endif
   1447 }
   1448 template<> EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
   1449 {
   1450 #if EIGEN_ARCH_ARM64
   1451   return vceqq_u64(a,b);
   1452 #else
   1453   return vcombine_u64(
   1454       vdup_n_u64(vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
   1455       vdup_n_u64(vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
   1456 #endif
   1457 }
   1458 
   1459 template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt_or_nan<Packet2f>(const Packet2f& a, const Packet2f& b)
   1460 { return vreinterpret_f32_u32(vmvn_u32(vcge_f32(a,b))); }
   1461 template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b)
   1462 { return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a,b))); }
   1463 
   1464 // Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
   1465 template<> EIGEN_STRONG_INLINE Packet2f pand<Packet2f>(const Packet2f& a, const Packet2f& b)
   1466 { return vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
   1467 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
   1468 { return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
   1469 template<> EIGEN_STRONG_INLINE Packet4c pand<Packet4c>(const Packet4c& a, const Packet4c& b)
   1470 { return a & b; }
   1471 template<> EIGEN_STRONG_INLINE Packet8c pand<Packet8c>(const Packet8c& a, const Packet8c& b)
   1472 { return vand_s8(a,b); }
   1473 template<> EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(const Packet16c& a, const Packet16c& b)
   1474 { return vandq_s8(a,b); }
   1475 template<> EIGEN_STRONG_INLINE Packet4uc pand<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
   1476 { return a & b; }
   1477 template<> EIGEN_STRONG_INLINE Packet8uc pand<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
   1478 { return vand_u8(a,b); }
   1479 template<> EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
   1480 { return vandq_u8(a,b); }
   1481 template<> EIGEN_STRONG_INLINE Packet4s pand<Packet4s>(const Packet4s& a, const Packet4s& b) { return vand_s16(a,b); }
   1482 template<> EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(const Packet8s& a, const Packet8s& b) { return vandq_s16(a,b); }
   1483 template<> EIGEN_STRONG_INLINE Packet4us pand<Packet4us>(const Packet4us& a, const Packet4us& b)
   1484 { return vand_u16(a,b); }
   1485 template<> EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b)
   1486 { return vandq_u16(a,b); }
   1487 template<> EIGEN_STRONG_INLINE Packet2i pand<Packet2i>(const Packet2i& a, const Packet2i& b) { return vand_s32(a,b); }
   1488 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); }
   1489 template<> EIGEN_STRONG_INLINE Packet2ui pand<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
   1490 { return vand_u32(a,b); }
   1491 template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
   1492 { return vandq_u32(a,b); }
   1493 template<> EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) { return vandq_s64(a,b); }
   1494 template<> EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
   1495 { return vandq_u64(a,b); }
   1496 
   1497 template<> EIGEN_STRONG_INLINE Packet2f por<Packet2f>(const Packet2f& a, const Packet2f& b)
   1498 { return vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
   1499 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
   1500 { return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
   1501 template<> EIGEN_STRONG_INLINE Packet4c por<Packet4c>(const Packet4c& a, const Packet4c& b)
   1502 { return a | b; }
   1503 template<> EIGEN_STRONG_INLINE Packet8c por<Packet8c>(const Packet8c& a, const Packet8c& b) { return vorr_s8(a,b); }
   1504 template<> EIGEN_STRONG_INLINE Packet16c por<Packet16c>(const Packet16c& a, const Packet16c& b)
   1505 { return vorrq_s8(a,b); }
   1506 template<> EIGEN_STRONG_INLINE Packet4uc por<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
   1507 { return a | b; }
   1508 template<> EIGEN_STRONG_INLINE Packet8uc por<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
   1509 { return vorr_u8(a,b); }
   1510 template<> EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
   1511 { return vorrq_u8(a,b); }
   1512 template<> EIGEN_STRONG_INLINE Packet4s por<Packet4s>(const Packet4s& a, const Packet4s& b)
   1513 { return vorr_s16(a,b); }
   1514 template<> EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b)
   1515 { return vorrq_s16(a,b); }
   1516 template<> EIGEN_STRONG_INLINE Packet4us por<Packet4us>(const Packet4us& a, const Packet4us& b)
   1517 { return vorr_u16(a,b); }
   1518 template<> EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b)
   1519 { return vorrq_u16(a,b); }
   1520 template<> EIGEN_STRONG_INLINE Packet2i por<Packet2i>(const Packet2i& a, const Packet2i& b) { return vorr_s32(a,b); }
   1521 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); }
   1522 template<> EIGEN_STRONG_INLINE Packet2ui por<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
   1523 { return vorr_u32(a,b); }
   1524 template<> EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
   1525 { return vorrq_u32(a,b); }
   1526 template<> EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b)
   1527 { return vorrq_s64(a,b); }
   1528 template<> EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
   1529 { return vorrq_u64(a,b); }
   1530 
   1531 template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b)
   1532 { return vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
   1533 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
   1534 { return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
   1535 template<> EIGEN_STRONG_INLINE Packet4c pxor<Packet4c>(const Packet4c& a, const Packet4c& b)
   1536 { return a ^ b; }
   1537 template<> EIGEN_STRONG_INLINE Packet8c pxor<Packet8c>(const Packet8c& a, const Packet8c& b)
   1538 { return veor_s8(a,b); }
   1539 template<> EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(const Packet16c& a, const Packet16c& b)
   1540 { return veorq_s8(a,b); }
   1541 template<> EIGEN_STRONG_INLINE Packet4uc pxor<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
   1542 { return a ^ b; }
   1543 template<> EIGEN_STRONG_INLINE Packet8uc pxor<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
   1544 { return veor_u8(a,b); }
   1545 template<> EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
   1546 { return veorq_u8(a,b); }
   1547 template<> EIGEN_STRONG_INLINE Packet4s pxor<Packet4s>(const Packet4s& a, const Packet4s& b) { return veor_s16(a,b); }
   1548 template<> EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(const Packet8s& a, const Packet8s& b) { return veorq_s16(a,b); }
   1549 template<> EIGEN_STRONG_INLINE Packet4us pxor<Packet4us>(const Packet4us& a, const Packet4us& b)
   1550 { return veor_u16(a,b); }
   1551 template<> EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b)
   1552 { return veorq_u16(a,b); }
   1553 template<> EIGEN_STRONG_INLINE Packet2i pxor<Packet2i>(const Packet2i& a, const Packet2i& b) { return veor_s32(a,b); }
   1554 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); }
   1555 template<> EIGEN_STRONG_INLINE Packet2ui pxor<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
   1556 { return veor_u32(a,b); }
   1557 template<> EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
   1558 { return veorq_u32(a,b); }
   1559 template<> EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b)
   1560 { return veorq_s64(a,b); }
   1561 template<> EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
   1562 { return veorq_u64(a,b); }
   1563 
   1564 template<> EIGEN_STRONG_INLINE Packet2f pandnot<Packet2f>(const Packet2f& a, const Packet2f& b)
   1565 { return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
   1566 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
   1567 { return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
   1568 template<> EIGEN_STRONG_INLINE Packet4c pandnot<Packet4c>(const Packet4c& a, const Packet4c& b)
   1569 { return a & ~b; }
   1570 template<> EIGEN_STRONG_INLINE Packet8c pandnot<Packet8c>(const Packet8c& a, const Packet8c& b) { return vbic_s8(a,b); }
   1571 template<> EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(const Packet16c& a, const Packet16c& b) { return vbicq_s8(a,b); }
   1572 template<> EIGEN_STRONG_INLINE Packet4uc pandnot<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
   1573 { return a & ~b; }
   1574 template<> EIGEN_STRONG_INLINE Packet8uc pandnot<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
   1575 { return vbic_u8(a,b); }
   1576 template<> EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
   1577 { return vbicq_u8(a,b); }
   1578 template<> EIGEN_STRONG_INLINE Packet4s pandnot<Packet4s>(const Packet4s& a, const Packet4s& b)
   1579 { return vbic_s16(a,b); }
   1580 template<> EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(const Packet8s& a, const Packet8s& b)
   1581 { return vbicq_s16(a,b); }
   1582 template<> EIGEN_STRONG_INLINE Packet4us pandnot<Packet4us>(const Packet4us& a, const Packet4us& b)
   1583 { return vbic_u16(a,b); }
   1584 template<> EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(const Packet8us& a, const Packet8us& b)
   1585 { return vbicq_u16(a,b); }
   1586 template<> EIGEN_STRONG_INLINE Packet2i pandnot<Packet2i>(const Packet2i& a, const Packet2i& b)
   1587 { return vbic_s32(a,b); }
   1588 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b)
   1589 { return vbicq_s32(a,b); }
   1590 template<> EIGEN_STRONG_INLINE Packet2ui pandnot<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
   1591 { return vbic_u32(a,b); }
   1592 template<> EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
   1593 { return vbicq_u32(a,b); }
   1594 template<> EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b)
   1595 { return vbicq_s64(a,b); }
   1596 template<> EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
   1597 { return vbicq_u64(a,b); }
   1598 
   1599 
   1600 template<int N> EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a)
   1601 { return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); }
   1602 template<int N> EIGEN_STRONG_INLINE Packet8c parithmetic_shift_right(Packet8c a) { return vshr_n_s8(a,N); }
   1603 template<int N> EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(Packet16c a) { return vshrq_n_s8(a,N); }
   1604 template<int N> EIGEN_STRONG_INLINE Packet4uc parithmetic_shift_right(Packet4uc& a)
   1605 { return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); }
   1606 template<int N> EIGEN_STRONG_INLINE Packet8uc parithmetic_shift_right(Packet8uc a) { return vshr_n_u8(a,N); }
   1607 template<int N> EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); }
   1608 template<int N> EIGEN_STRONG_INLINE Packet4s parithmetic_shift_right(Packet4s a) { return vshr_n_s16(a,N); }
   1609 template<int N> EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) { return vshrq_n_s16(a,N); }
   1610 template<int N> EIGEN_STRONG_INLINE Packet4us parithmetic_shift_right(Packet4us a) { return vshr_n_u16(a,N); }
   1611 template<int N> EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(Packet8us a) { return vshrq_n_u16(a,N); }
   1612 template<int N> EIGEN_STRONG_INLINE Packet2i parithmetic_shift_right(Packet2i a) { return vshr_n_s32(a,N); }
   1613 template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) { return vshrq_n_s32(a,N); }
   1614 template<int N> EIGEN_STRONG_INLINE Packet2ui parithmetic_shift_right(Packet2ui a) { return vshr_n_u32(a,N); }
   1615 template<int N> EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); }
   1616 template<int N> EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(Packet2l a) { return vshrq_n_s64(a,N); }
   1617 template<int N> EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); }
   1618 
   1619 template<int N> EIGEN_STRONG_INLINE Packet4c plogical_shift_right(Packet4c& a)
   1620 { return vget_lane_s32(vreinterpret_s32_u8(vshr_n_u8(vreinterpret_u8_s32(vdup_n_s32(a)), N)), 0); }
   1621 template<int N> EIGEN_STRONG_INLINE Packet8c plogical_shift_right(Packet8c a)
   1622 { return vreinterpret_s8_u8(vshr_n_u8(vreinterpret_u8_s8(a),N)); }
   1623 template<int N> EIGEN_STRONG_INLINE Packet16c plogical_shift_right(Packet16c a)
   1624 { return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(a),N)); }
   1625 template<int N> EIGEN_STRONG_INLINE Packet4uc plogical_shift_right(Packet4uc& a)
   1626 { return vget_lane_u32(vreinterpret_u32_s8(vshr_n_s8(vreinterpret_s8_u32(vdup_n_u32(a)), N)), 0); }
   1627 template<int N> EIGEN_STRONG_INLINE Packet8uc plogical_shift_right(Packet8uc a) { return vshr_n_u8(a,N); }
   1628 template<int N> EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); }
   1629 template<int N> EIGEN_STRONG_INLINE Packet4s plogical_shift_right(Packet4s a)
   1630 { return vreinterpret_s16_u16(vshr_n_u16(vreinterpret_u16_s16(a),N)); }
   1631 template<int N> EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a)
   1632 { return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(a),N)); }
   1633 template<int N> EIGEN_STRONG_INLINE Packet4us plogical_shift_right(Packet4us a) { return vshr_n_u16(a,N); }
   1634 template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_right(Packet8us a) { return vshrq_n_u16(a,N); }
   1635 template<int N> EIGEN_STRONG_INLINE Packet2i plogical_shift_right(Packet2i a)
   1636 { return vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(a),N)); }
   1637 template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a)
   1638 { return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a),N)); }
   1639 template<int N> EIGEN_STRONG_INLINE Packet2ui plogical_shift_right(Packet2ui a) { return vshr_n_u32(a,N); }
   1640 template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); }
   1641 template<int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_right(Packet2l a)
   1642 { return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a),N)); }
   1643 template<int N> EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); }
   1644 
   1645 template<int N> EIGEN_STRONG_INLINE Packet4c plogical_shift_left(Packet4c& a)
   1646 { return vget_lane_s32(vreinterpret_s32_s8(vshl_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); }
   1647 template<int N> EIGEN_STRONG_INLINE Packet8c plogical_shift_left(Packet8c a) { return vshl_n_s8(a,N); }
   1648 template<int N> EIGEN_STRONG_INLINE Packet16c plogical_shift_left(Packet16c a) { return vshlq_n_s8(a,N); }
   1649 template<int N> EIGEN_STRONG_INLINE Packet4uc plogical_shift_left(Packet4uc& a)
   1650 { return vget_lane_u32(vreinterpret_u32_u8(vshl_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); }
   1651 template<int N> EIGEN_STRONG_INLINE Packet8uc plogical_shift_left(Packet8uc a) { return vshl_n_u8(a,N); }
   1652 template<int N> EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(Packet16uc a) { return vshlq_n_u8(a,N); }
   1653 template<int N> EIGEN_STRONG_INLINE Packet4s plogical_shift_left(Packet4s a) { return vshl_n_s16(a,N); }
   1654 template<int N> EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) { return vshlq_n_s16(a,N); }
   1655 template<int N> EIGEN_STRONG_INLINE Packet4us plogical_shift_left(Packet4us a) { return vshl_n_u16(a,N); }
   1656 template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(Packet8us a) { return vshlq_n_u16(a,N); }
   1657 template<int N> EIGEN_STRONG_INLINE Packet2i plogical_shift_left(Packet2i a) { return vshl_n_s32(a,N); }
   1658 template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) { return vshlq_n_s32(a,N); }
   1659 template<int N> EIGEN_STRONG_INLINE Packet2ui plogical_shift_left(Packet2ui a) { return vshl_n_u32(a,N); }
   1660 template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(Packet4ui a) { return vshlq_n_u32(a,N); }
   1661 template<int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_left(Packet2l a) { return vshlq_n_s64(a,N); }
   1662 template<int N> EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) { return vshlq_n_u64(a,N); }
   1663 
   1664 template<> EIGEN_STRONG_INLINE Packet2f pload<Packet2f>(const float* from)
   1665 { EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from); }
   1666 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
   1667 { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
   1668 template<> EIGEN_STRONG_INLINE Packet4c pload<Packet4c>(const int8_t* from)
   1669 {
   1670   Packet4c res;
   1671   memcpy(&res, from, sizeof(Packet4c));
   1672   return res;
   1673 }
   1674 template<> EIGEN_STRONG_INLINE Packet8c pload<Packet8c>(const int8_t* from)
   1675 { EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from); }
   1676 template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from)
   1677 { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from); }
   1678 template<> EIGEN_STRONG_INLINE Packet4uc pload<Packet4uc>(const uint8_t* from)
   1679 {
   1680   Packet4uc res;
   1681   memcpy(&res, from, sizeof(Packet4uc));
   1682   return res;
   1683 }
   1684 template<> EIGEN_STRONG_INLINE Packet8uc pload<Packet8uc>(const uint8_t* from)
   1685 { EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from); }
   1686 template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from)
   1687 { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from); }
   1688 template<> EIGEN_STRONG_INLINE Packet4s pload<Packet4s>(const int16_t* from)
   1689 { EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from); }
   1690 template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from)
   1691 { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from); }
   1692 template<> EIGEN_STRONG_INLINE Packet4us pload<Packet4us>(const uint16_t* from)
   1693 { EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from); }
   1694 template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from)
   1695 { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from); }
   1696 template<> EIGEN_STRONG_INLINE Packet2i pload<Packet2i>(const int32_t* from)
   1697 { EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from); }
   1698 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from)
   1699 { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
   1700 template<> EIGEN_STRONG_INLINE Packet2ui pload<Packet2ui>(const uint32_t* from)
   1701 { EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from); }
   1702 template<> EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from)
   1703 { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from); }
   1704 template<> EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from)
   1705 { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from); }
   1706 template<> EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from)
   1707 { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from); }
   1708 
   1709 template<> EIGEN_STRONG_INLINE Packet2f ploadu<Packet2f>(const float* from)
   1710 { EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f32(from); }
   1711 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
   1712 { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
   1713 template<> EIGEN_STRONG_INLINE Packet4c ploadu<Packet4c>(const int8_t* from)
   1714 {
   1715   Packet4c res;
   1716   memcpy(&res, from, sizeof(Packet4c));
   1717   return res;
   1718 }
   1719 template<> EIGEN_STRONG_INLINE Packet8c ploadu<Packet8c>(const int8_t* from)
   1720 { EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s8(from); }
   1721 template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const int8_t* from)
   1722 { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s8(from); }
   1723 template<> EIGEN_STRONG_INLINE Packet4uc ploadu<Packet4uc>(const uint8_t* from)
   1724 {
   1725   Packet4uc res;
   1726   memcpy(&res, from, sizeof(Packet4uc));
   1727   return res;
   1728 }
   1729 template<> EIGEN_STRONG_INLINE Packet8uc ploadu<Packet8uc>(const uint8_t* from)
   1730 { EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u8(from); }
   1731 template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const uint8_t* from)
   1732 { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u8(from); }
   1733 template<> EIGEN_STRONG_INLINE Packet4s ploadu<Packet4s>(const int16_t* from)
   1734 { EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s16(from); }
   1735 template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const int16_t* from)
   1736 { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s16(from); }
   1737 template<> EIGEN_STRONG_INLINE Packet4us ploadu<Packet4us>(const uint16_t* from)
   1738 { EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u16(from); }
   1739 template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const uint16_t* from)
   1740 { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u16(from); }
   1741 template<> EIGEN_STRONG_INLINE Packet2i ploadu<Packet2i>(const int32_t* from)
   1742 { EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s32(from); }
   1743 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from)
   1744 { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
   1745 template<> EIGEN_STRONG_INLINE Packet2ui ploadu<Packet2ui>(const uint32_t* from)
   1746 { EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u32(from); }
   1747 template<> EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from)
   1748 { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u32(from); }
   1749 template<> EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from)
   1750 { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s64(from); }
   1751 template<> EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(const uint64_t* from)
   1752 { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u64(from); }
   1753 
   1754 template<> EIGEN_STRONG_INLINE Packet2f ploaddup<Packet2f>(const float* from)
   1755 { return vld1_dup_f32(from); }
   1756 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
   1757 { return vcombine_f32(vld1_dup_f32(from), vld1_dup_f32(from+1)); }
   1758 template<> EIGEN_STRONG_INLINE Packet4c ploaddup<Packet4c>(const int8_t* from)
   1759 {
   1760   const int8x8_t a = vreinterpret_s8_s32(vdup_n_s32(pload<Packet4c>(from)));
   1761   return vget_lane_s32(vreinterpret_s32_s8(vzip_s8(a,a).val[0]), 0);
   1762 }
   1763 template<> EIGEN_STRONG_INLINE Packet8c ploaddup<Packet8c>(const int8_t* from)
   1764 {
   1765   const int8x8_t a = vld1_s8(from);
   1766   return vzip_s8(a,a).val[0];
   1767 }
   1768 template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const int8_t* from)
   1769 {
   1770   const int8x8_t a = vld1_s8(from);
   1771   const int8x8x2_t b = vzip_s8(a,a);
   1772   return vcombine_s8(b.val[0], b.val[1]);
   1773 }
   1774 template<> EIGEN_STRONG_INLINE Packet4uc ploaddup<Packet4uc>(const uint8_t* from)
   1775 {
   1776   const uint8x8_t a = vreinterpret_u8_u32(vdup_n_u32(pload<Packet4uc>(from)));
   1777   return vget_lane_u32(vreinterpret_u32_u8(vzip_u8(a,a).val[0]), 0);
   1778 }
   1779 template<> EIGEN_STRONG_INLINE Packet8uc ploaddup<Packet8uc>(const uint8_t* from)
   1780 {
   1781   const uint8x8_t a = vld1_u8(from);
   1782   return vzip_u8(a,a).val[0];
   1783 }
   1784 template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const uint8_t* from)
   1785 {
   1786   const uint8x8_t a = vld1_u8(from);
   1787   const uint8x8x2_t b = vzip_u8(a,a);
   1788   return vcombine_u8(b.val[0], b.val[1]);
   1789 }
   1790 template<> EIGEN_STRONG_INLINE Packet4s ploaddup<Packet4s>(const int16_t* from)
   1791 {
   1792   return vreinterpret_s16_u32(vzip_u32(vreinterpret_u32_s16(vld1_dup_s16(from)),
   1793       vreinterpret_u32_s16(vld1_dup_s16(from+1))).val[0]);
   1794 }
   1795 template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const int16_t* from)
   1796 {
   1797   const int16x4_t a = vld1_s16(from);
   1798   const int16x4x2_t b = vzip_s16(a,a);
   1799   return vcombine_s16(b.val[0], b.val[1]);
   1800 }
   1801 template<> EIGEN_STRONG_INLINE Packet4us ploaddup<Packet4us>(const uint16_t* from)
   1802 {
   1803   return vreinterpret_u16_u32(vzip_u32(vreinterpret_u32_u16(vld1_dup_u16(from)),
   1804       vreinterpret_u32_u16(vld1_dup_u16(from+1))).val[0]);
   1805 }
   1806 template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const uint16_t* from)
   1807 {
   1808   const uint16x4_t a = vld1_u16(from);
   1809   const uint16x4x2_t b = vzip_u16(a,a);
   1810   return vcombine_u16(b.val[0], b.val[1]);
   1811 }
   1812 template<> EIGEN_STRONG_INLINE Packet2i ploaddup<Packet2i>(const int32_t* from)
   1813 { return vld1_dup_s32(from); }
   1814 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from)
   1815 { return vcombine_s32(vld1_dup_s32(from), vld1_dup_s32(from+1)); }
   1816 template<> EIGEN_STRONG_INLINE Packet2ui ploaddup<Packet2ui>(const uint32_t* from)
   1817 { return vld1_dup_u32(from); }
   1818 template<> EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from)
   1819 { return vcombine_u32(vld1_dup_u32(from), vld1_dup_u32(from+1)); }
   1820 template<> EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from)
   1821 { return vld1q_dup_s64(from); }
   1822 template<> EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(const uint64_t* from)
   1823 { return vld1q_dup_u64(from); }
   1824 
   1825 template<> EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(const float* from) { return vld1q_dup_f32(from); }
   1826 template<> EIGEN_STRONG_INLINE Packet4c ploadquad<Packet4c>(const int8_t* from)
   1827 { return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0); }
   1828 template<> EIGEN_STRONG_INLINE Packet8c ploadquad<Packet8c>(const int8_t* from)
   1829 {
   1830   return vreinterpret_s8_u32(vzip_u32(
   1831       vreinterpret_u32_s8(vld1_dup_s8(from)),
   1832       vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]);
   1833 }
   1834 template<> EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const int8_t* from)
   1835 {
   1836   const int8x8_t a = vreinterpret_s8_u32(vzip_u32(
   1837       vreinterpret_u32_s8(vld1_dup_s8(from)),
   1838       vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]);
   1839   const int8x8_t b = vreinterpret_s8_u32(vzip_u32(
   1840       vreinterpret_u32_s8(vld1_dup_s8(from+2)),
   1841       vreinterpret_u32_s8(vld1_dup_s8(from+3))).val[0]);
   1842   return vcombine_s8(a,b);
   1843 }
   1844 template<> EIGEN_STRONG_INLINE Packet4uc ploadquad<Packet4uc>(const uint8_t* from)
   1845 { return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0); }
   1846 template<> EIGEN_STRONG_INLINE Packet8uc ploadquad<Packet8uc>(const uint8_t* from)
   1847 {
   1848   return vreinterpret_u8_u32(vzip_u32(
   1849       vreinterpret_u32_u8(vld1_dup_u8(from)),
   1850       vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]);
   1851 }
   1852 template<> EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const uint8_t* from)
   1853 {
   1854   const uint8x8_t a = vreinterpret_u8_u32(vzip_u32(
   1855       vreinterpret_u32_u8(vld1_dup_u8(from)),
   1856       vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]);
   1857   const uint8x8_t b = vreinterpret_u8_u32(vzip_u32(
   1858       vreinterpret_u32_u8(vld1_dup_u8(from+2)),
   1859       vreinterpret_u32_u8(vld1_dup_u8(from+3))).val[0]);
   1860   return vcombine_u8(a,b);
   1861 }
   1862 template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const int16_t* from)
   1863 { return vcombine_s16(vld1_dup_s16(from), vld1_dup_s16(from+1)); }
   1864 template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const uint16_t* from)
   1865 { return vcombine_u16(vld1_dup_u16(from), vld1_dup_u16(from+1)); }
   1866 template<> EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(const int32_t* from) { return vld1q_dup_s32(from); }
   1867 template<> EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) { return vld1q_dup_u32(from); }
   1868 
   1869 template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet2f& from)
   1870 { EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to,from); }
   1871 template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
   1872 { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to,from); }
   1873 template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet4c& from)
   1874 { memcpy(to, &from, sizeof(from)); }
   1875 template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet8c& from)
   1876 { EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to,from); }
   1877 template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from)
   1878 { EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to,from); }
   1879 template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet4uc& from)
   1880 { memcpy(to, &from, sizeof(from)); }
   1881 template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet8uc& from)
   1882 { EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to,from); }
   1883 template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from)
   1884 { EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to,from); }
   1885 template<> EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet4s& from)
   1886 { EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to,from); }
   1887 template<> EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from)
   1888 { EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to,from); }
   1889 template<> EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet4us& from)
   1890 { EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to,from); }
   1891 template<> EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from)
   1892 { EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to,from); }
   1893 template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet2i& from)
   1894 { EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to,from); }
   1895 template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from)
   1896 { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to,from); }
   1897 template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet2ui& from)
   1898 { EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to,from); }
   1899 template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from)
   1900 { EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to,from); }
   1901 template<> EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from)
   1902 { EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to,from); }
   1903 template<> EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from)
   1904 { EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to,from); }
   1905 
   1906 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet2f& from)
   1907 { EIGEN_DEBUG_UNALIGNED_STORE vst1_f32(to,from); }
   1908 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
   1909 { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to,from); }
   1910 template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet4c& from)
   1911 { memcpy(to, &from, sizeof(from)); }
   1912 template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet8c& from)
   1913 { EIGEN_DEBUG_UNALIGNED_STORE vst1_s8(to,from); }
   1914 template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet16c& from)
   1915 { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s8(to,from); }
   1916 template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet4uc& from)
   1917 { memcpy(to, &from, sizeof(from)); }
   1918 template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet8uc& from)
   1919 { EIGEN_DEBUG_UNALIGNED_STORE vst1_u8(to,from); }
   1920 template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet16uc& from)
   1921 { EIGEN_DEBUG_UNALIGNED_STORE vst1q_u8(to,from); }
   1922 template<> EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet4s& from)
   1923 { EIGEN_DEBUG_UNALIGNED_STORE vst1_s16(to,from); }
   1924 template<> EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet8s& from)
   1925 { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s16(to,from); }
   1926 template<> EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet4us& from)
   1927 { EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(to,from); }
   1928 template<> EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet8us& from)
   1929 { EIGEN_DEBUG_UNALIGNED_STORE vst1q_u16(to,from); }
   1930 template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet2i& from)
   1931 { EIGEN_DEBUG_UNALIGNED_STORE vst1_s32(to,from); }
   1932 template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from)
   1933 { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to,from); }
   1934 template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet2ui& from)
   1935 { EIGEN_DEBUG_UNALIGNED_STORE vst1_u32(to,from); }
   1936 template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from)
   1937 { EIGEN_DEBUG_UNALIGNED_STORE vst1q_u32(to,from); }
   1938 template<> EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from)
   1939 { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s64(to,from); }
   1940 template<> EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet2ul& from)
   1941 { EIGEN_DEBUG_UNALIGNED_STORE vst1q_u64(to,from); }
   1942 
   1943 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pgather<float, Packet2f>(const float* from, Index stride)
   1944 {
   1945   Packet2f res = vld1_dup_f32(from);
   1946   res = vld1_lane_f32(from + 1*stride, res, 1);
   1947   return res;
   1948 }
   1949 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride)
   1950 {
   1951   Packet4f res = vld1q_dup_f32(from);
   1952   res = vld1q_lane_f32(from + 1*stride, res, 1);
   1953   res = vld1q_lane_f32(from + 2*stride, res, 2);
   1954   res = vld1q_lane_f32(from + 3*stride, res, 3);
   1955   return res;
   1956 }
   1957 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pgather<int8_t, Packet4c>(const int8_t* from, Index stride)
   1958 {
   1959   Packet4c res;
   1960   for (int i = 0; i != 4; i++)
   1961     reinterpret_cast<int8_t*>(&res)[i] = *(from + i * stride);
   1962   return res;
   1963 }
   1964 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pgather<int8_t, Packet8c>(const int8_t* from, Index stride)
   1965 {
   1966   Packet8c res = vld1_dup_s8(from);
   1967   res = vld1_lane_s8(from + 1*stride, res, 1);
   1968   res = vld1_lane_s8(from + 2*stride, res, 2);
   1969   res = vld1_lane_s8(from + 3*stride, res, 3);
   1970   res = vld1_lane_s8(from + 4*stride, res, 4);
   1971   res = vld1_lane_s8(from + 5*stride, res, 5);
   1972   res = vld1_lane_s8(from + 6*stride, res, 6);
   1973   res = vld1_lane_s8(from + 7*stride, res, 7);
   1974   return res;
   1975 }
   1976 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride)
   1977 {
   1978   Packet16c res = vld1q_dup_s8(from);
   1979   res = vld1q_lane_s8(from + 1*stride, res, 1);
   1980   res = vld1q_lane_s8(from + 2*stride, res, 2);
   1981   res = vld1q_lane_s8(from + 3*stride, res, 3);
   1982   res = vld1q_lane_s8(from + 4*stride, res, 4);
   1983   res = vld1q_lane_s8(from + 5*stride, res, 5);
   1984   res = vld1q_lane_s8(from + 6*stride, res, 6);
   1985   res = vld1q_lane_s8(from + 7*stride, res, 7);
   1986   res = vld1q_lane_s8(from + 8*stride, res, 8);
   1987   res = vld1q_lane_s8(from + 9*stride, res, 9);
   1988   res = vld1q_lane_s8(from + 10*stride, res, 10);
   1989   res = vld1q_lane_s8(from + 11*stride, res, 11);
   1990   res = vld1q_lane_s8(from + 12*stride, res, 12);
   1991   res = vld1q_lane_s8(from + 13*stride, res, 13);
   1992   res = vld1q_lane_s8(from + 14*stride, res, 14);
   1993   res = vld1q_lane_s8(from + 15*stride, res, 15);
   1994   return res;
   1995 }
   1996 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pgather<uint8_t, Packet4uc>(const uint8_t* from, Index stride)
   1997 {
   1998   Packet4uc res;
   1999   for (int i = 0; i != 4; i++)
   2000     reinterpret_cast<uint8_t*>(&res)[i] = *(from + i * stride);
   2001   return res;
   2002 }
   2003 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pgather<uint8_t, Packet8uc>(const uint8_t* from, Index stride)
   2004 {
   2005   Packet8uc res = vld1_dup_u8(from);
   2006   res = vld1_lane_u8(from + 1*stride, res, 1);
   2007   res = vld1_lane_u8(from + 2*stride, res, 2);
   2008   res = vld1_lane_u8(from + 3*stride, res, 3);
   2009   res = vld1_lane_u8(from + 4*stride, res, 4);
   2010   res = vld1_lane_u8(from + 5*stride, res, 5);
   2011   res = vld1_lane_u8(from + 6*stride, res, 6);
   2012   res = vld1_lane_u8(from + 7*stride, res, 7);
   2013   return res;
   2014 }
   2015 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride)
   2016 {
   2017   Packet16uc res = vld1q_dup_u8(from);
   2018   res = vld1q_lane_u8(from + 1*stride, res, 1);
   2019   res = vld1q_lane_u8(from + 2*stride, res, 2);
   2020   res = vld1q_lane_u8(from + 3*stride, res, 3);
   2021   res = vld1q_lane_u8(from + 4*stride, res, 4);
   2022   res = vld1q_lane_u8(from + 5*stride, res, 5);
   2023   res = vld1q_lane_u8(from + 6*stride, res, 6);
   2024   res = vld1q_lane_u8(from + 7*stride, res, 7);
   2025   res = vld1q_lane_u8(from + 8*stride, res, 8);
   2026   res = vld1q_lane_u8(from + 9*stride, res, 9);
   2027   res = vld1q_lane_u8(from + 10*stride, res, 10);
   2028   res = vld1q_lane_u8(from + 11*stride, res, 11);
   2029   res = vld1q_lane_u8(from + 12*stride, res, 12);
   2030   res = vld1q_lane_u8(from + 13*stride, res, 13);
   2031   res = vld1q_lane_u8(from + 14*stride, res, 14);
   2032   res = vld1q_lane_u8(from + 15*stride, res, 15);
   2033   return res;
   2034 }
   2035 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pgather<int16_t, Packet4s>(const int16_t* from, Index stride)
   2036 {
   2037   Packet4s res = vld1_dup_s16(from);
   2038   res = vld1_lane_s16(from + 1*stride, res, 1);
   2039   res = vld1_lane_s16(from + 2*stride, res, 2);
   2040   res = vld1_lane_s16(from + 3*stride, res, 3);
   2041   return res;
   2042 }
   2043 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride)
   2044 {
   2045   Packet8s res = vld1q_dup_s16(from);
   2046   res = vld1q_lane_s16(from + 1*stride, res, 1);
   2047   res = vld1q_lane_s16(from + 2*stride, res, 2);
   2048   res = vld1q_lane_s16(from + 3*stride, res, 3);
   2049   res = vld1q_lane_s16(from + 4*stride, res, 4);
   2050   res = vld1q_lane_s16(from + 5*stride, res, 5);
   2051   res = vld1q_lane_s16(from + 6*stride, res, 6);
   2052   res = vld1q_lane_s16(from + 7*stride, res, 7);
   2053   return res;
   2054 }
   2055 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pgather<uint16_t, Packet4us>(const uint16_t* from, Index stride)
   2056 {
   2057   Packet4us res = vld1_dup_u16(from);
   2058   res = vld1_lane_u16(from + 1*stride, res, 1);
   2059   res = vld1_lane_u16(from + 2*stride, res, 2);
   2060   res = vld1_lane_u16(from + 3*stride, res, 3);
   2061   return res;
   2062 }
   2063 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride)
   2064 {
   2065   Packet8us res = vld1q_dup_u16(from);
   2066   res = vld1q_lane_u16(from + 1*stride, res, 1);
   2067   res = vld1q_lane_u16(from + 2*stride, res, 2);
   2068   res = vld1q_lane_u16(from + 3*stride, res, 3);
   2069   res = vld1q_lane_u16(from + 4*stride, res, 4);
   2070   res = vld1q_lane_u16(from + 5*stride, res, 5);
   2071   res = vld1q_lane_u16(from + 6*stride, res, 6);
   2072   res = vld1q_lane_u16(from + 7*stride, res, 7);
   2073   return res;
   2074 }
   2075 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pgather<int32_t, Packet2i>(const int32_t* from, Index stride)
   2076 {
   2077   Packet2i res = vld1_dup_s32(from);
   2078   res = vld1_lane_s32(from + 1*stride, res, 1);
   2079   return res;
   2080 }
   2081 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride)
   2082 {
   2083   Packet4i res = vld1q_dup_s32(from);
   2084   res = vld1q_lane_s32(from + 1*stride, res, 1);
   2085   res = vld1q_lane_s32(from + 2*stride, res, 2);
   2086   res = vld1q_lane_s32(from + 3*stride, res, 3);
   2087   return res;
   2088 }
   2089 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pgather<uint32_t, Packet2ui>(const uint32_t* from, Index stride)
   2090 {
   2091   Packet2ui res = vld1_dup_u32(from);
   2092   res = vld1_lane_u32(from + 1*stride, res, 1);
   2093   return res;
   2094 }
   2095 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride)
   2096 {
   2097   Packet4ui res = vld1q_dup_u32(from);
   2098   res = vld1q_lane_u32(from + 1*stride, res, 1);
   2099   res = vld1q_lane_u32(from + 2*stride, res, 2);
   2100   res = vld1q_lane_u32(from + 3*stride, res, 3);
   2101   return res;
   2102 }
   2103 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride)
   2104 {
   2105   Packet2l res = vld1q_dup_s64(from);
   2106   res = vld1q_lane_s64(from + 1*stride, res, 1);
   2107   return res;
   2108 }
   2109 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride)
   2110 {
   2111   Packet2ul res = vld1q_dup_u64(from);
   2112   res = vld1q_lane_u64(from + 1*stride, res, 1);
   2113   return res;
   2114 }
   2115 
   2116 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet2f>(float* to, const Packet2f& from, Index stride)
   2117 {
   2118   vst1_lane_f32(to + stride*0, from, 0);
   2119   vst1_lane_f32(to + stride*1, from, 1);
   2120 }
   2121 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
   2122 {
   2123   vst1q_lane_f32(to + stride*0, from, 0);
   2124   vst1q_lane_f32(to + stride*1, from, 1);
   2125   vst1q_lane_f32(to + stride*2, from, 2);
   2126   vst1q_lane_f32(to + stride*3, from, 3);
   2127 }
   2128 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet4c>(int8_t* to, const Packet4c& from, Index stride)
   2129 {
   2130   for (int i = 0; i != 4; i++)
   2131     *(to + i * stride) = reinterpret_cast<const int8_t*>(&from)[i];
   2132 }
   2133 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet8c>(int8_t* to, const Packet8c& from, Index stride)
   2134 {
   2135   vst1_lane_s8(to + stride*0, from, 0);
   2136   vst1_lane_s8(to + stride*1, from, 1);
   2137   vst1_lane_s8(to + stride*2, from, 2);
   2138   vst1_lane_s8(to + stride*3, from, 3);
   2139   vst1_lane_s8(to + stride*4, from, 4);
   2140   vst1_lane_s8(to + stride*5, from, 5);
   2141   vst1_lane_s8(to + stride*6, from, 6);
   2142   vst1_lane_s8(to + stride*7, from, 7);
   2143 }
   2144 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from, Index stride)
   2145 {
   2146   vst1q_lane_s8(to + stride*0, from, 0);
   2147   vst1q_lane_s8(to + stride*1, from, 1);
   2148   vst1q_lane_s8(to + stride*2, from, 2);
   2149   vst1q_lane_s8(to + stride*3, from, 3);
   2150   vst1q_lane_s8(to + stride*4, from, 4);
   2151   vst1q_lane_s8(to + stride*5, from, 5);
   2152   vst1q_lane_s8(to + stride*6, from, 6);
   2153   vst1q_lane_s8(to + stride*7, from, 7);
   2154   vst1q_lane_s8(to + stride*8, from, 8);
   2155   vst1q_lane_s8(to + stride*9, from, 9);
   2156   vst1q_lane_s8(to + stride*10, from, 10);
   2157   vst1q_lane_s8(to + stride*11, from, 11);
   2158   vst1q_lane_s8(to + stride*12, from, 12);
   2159   vst1q_lane_s8(to + stride*13, from, 13);
   2160   vst1q_lane_s8(to + stride*14, from, 14);
   2161   vst1q_lane_s8(to + stride*15, from, 15);
   2162 }
   2163 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet4uc>(uint8_t* to, const Packet4uc& from, Index stride)
   2164 {
   2165   for (int i = 0; i != 4; i++)
   2166     *(to + i * stride) = reinterpret_cast<const uint8_t*>(&from)[i];
   2167 }
   2168 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet8uc>(uint8_t* to, const Packet8uc& from, Index stride)
   2169 {
   2170   vst1_lane_u8(to + stride*0, from, 0);
   2171   vst1_lane_u8(to + stride*1, from, 1);
   2172   vst1_lane_u8(to + stride*2, from, 2);
   2173   vst1_lane_u8(to + stride*3, from, 3);
   2174   vst1_lane_u8(to + stride*4, from, 4);
   2175   vst1_lane_u8(to + stride*5, from, 5);
   2176   vst1_lane_u8(to + stride*6, from, 6);
   2177   vst1_lane_u8(to + stride*7, from, 7);
   2178 }
   2179 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from, Index stride)
   2180 {
   2181   vst1q_lane_u8(to + stride*0, from, 0);
   2182   vst1q_lane_u8(to + stride*1, from, 1);
   2183   vst1q_lane_u8(to + stride*2, from, 2);
   2184   vst1q_lane_u8(to + stride*3, from, 3);
   2185   vst1q_lane_u8(to + stride*4, from, 4);
   2186   vst1q_lane_u8(to + stride*5, from, 5);
   2187   vst1q_lane_u8(to + stride*6, from, 6);
   2188   vst1q_lane_u8(to + stride*7, from, 7);
   2189   vst1q_lane_u8(to + stride*8, from, 8);
   2190   vst1q_lane_u8(to + stride*9, from, 9);
   2191   vst1q_lane_u8(to + stride*10, from, 10);
   2192   vst1q_lane_u8(to + stride*11, from, 11);
   2193   vst1q_lane_u8(to + stride*12, from, 12);
   2194   vst1q_lane_u8(to + stride*13, from, 13);
   2195   vst1q_lane_u8(to + stride*14, from, 14);
   2196   vst1q_lane_u8(to + stride*15, from, 15);
   2197 }
   2198 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet4s>(int16_t* to, const Packet4s& from, Index stride)
   2199 {
   2200   vst1_lane_s16(to + stride*0, from, 0);
   2201   vst1_lane_s16(to + stride*1, from, 1);
   2202   vst1_lane_s16(to + stride*2, from, 2);
   2203   vst1_lane_s16(to + stride*3, from, 3);
   2204 }
   2205 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from, Index stride)
   2206 {
   2207   vst1q_lane_s16(to + stride*0, from, 0);
   2208   vst1q_lane_s16(to + stride*1, from, 1);
   2209   vst1q_lane_s16(to + stride*2, from, 2);
   2210   vst1q_lane_s16(to + stride*3, from, 3);
   2211   vst1q_lane_s16(to + stride*4, from, 4);
   2212   vst1q_lane_s16(to + stride*5, from, 5);
   2213   vst1q_lane_s16(to + stride*6, from, 6);
   2214   vst1q_lane_s16(to + stride*7, from, 7);
   2215 }
   2216 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet4us>(uint16_t* to, const Packet4us& from, Index stride)
   2217 {
   2218   vst1_lane_u16(to + stride*0, from, 0);
   2219   vst1_lane_u16(to + stride*1, from, 1);
   2220   vst1_lane_u16(to + stride*2, from, 2);
   2221   vst1_lane_u16(to + stride*3, from, 3);
   2222 }
   2223 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from, Index stride)
   2224 {
   2225   vst1q_lane_u16(to + stride*0, from, 0);
   2226   vst1q_lane_u16(to + stride*1, from, 1);
   2227   vst1q_lane_u16(to + stride*2, from, 2);
   2228   vst1q_lane_u16(to + stride*3, from, 3);
   2229   vst1q_lane_u16(to + stride*4, from, 4);
   2230   vst1q_lane_u16(to + stride*5, from, 5);
   2231   vst1q_lane_u16(to + stride*6, from, 6);
   2232   vst1q_lane_u16(to + stride*7, from, 7);
   2233 }
   2234 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet2i>(int32_t* to, const Packet2i& from, Index stride)
   2235 {
   2236   vst1_lane_s32(to + stride*0, from, 0);
   2237   vst1_lane_s32(to + stride*1, from, 1);
   2238 }
   2239 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride)
   2240 {
   2241   vst1q_lane_s32(to + stride*0, from, 0);
   2242   vst1q_lane_s32(to + stride*1, from, 1);
   2243   vst1q_lane_s32(to + stride*2, from, 2);
   2244   vst1q_lane_s32(to + stride*3, from, 3);
   2245 }
   2246 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet2ui>(uint32_t* to, const Packet2ui& from, Index stride)
   2247 {
   2248   vst1_lane_u32(to + stride*0, from, 0);
   2249   vst1_lane_u32(to + stride*1, from, 1);
   2250 }
   2251 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride)
   2252 {
   2253   vst1q_lane_u32(to + stride*0, from, 0);
   2254   vst1q_lane_u32(to + stride*1, from, 1);
   2255   vst1q_lane_u32(to + stride*2, from, 2);
   2256   vst1q_lane_u32(to + stride*3, from, 3);
   2257 }
   2258 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index stride)
   2259 {
   2260   vst1q_lane_s64(to + stride*0, from, 0);
   2261   vst1q_lane_s64(to + stride*1, from, 1);
   2262 }
   2263 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from, Index stride)
   2264 {
   2265   vst1q_lane_u64(to + stride*0, from, 0);
   2266   vst1q_lane_u64(to + stride*1, from, 1);
   2267 }
   2268 
   2269 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
   2270 template<> EIGEN_STRONG_INLINE void prefetch<int8_t>(const int8_t* addr) { EIGEN_ARM_PREFETCH(addr); }
   2271 template<> EIGEN_STRONG_INLINE void prefetch<uint8_t>(const uint8_t* addr) { EIGEN_ARM_PREFETCH(addr); }
   2272 template<> EIGEN_STRONG_INLINE void prefetch<int16_t>(const int16_t* addr) { EIGEN_ARM_PREFETCH(addr); }
   2273 template<> EIGEN_STRONG_INLINE void prefetch<uint16_t>(const uint16_t* addr) { EIGEN_ARM_PREFETCH(addr); }
   2274 template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
   2275 template<> EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
   2276 template<> EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) { EIGEN_ARM_PREFETCH(addr); }
   2277 template<> EIGEN_STRONG_INLINE void prefetch<uint64_t>(const uint64_t* addr) { EIGEN_ARM_PREFETCH(addr); }
   2278 
   2279 template<> EIGEN_STRONG_INLINE float pfirst<Packet2f>(const Packet2f& a) { return vget_lane_f32(a,0); }
   2280 template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return vgetq_lane_f32(a,0); }
   2281 template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet4c>(const Packet4c& a) { return static_cast<int8_t>(a & 0xff); }
   2282 template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet8c>(const Packet8c& a) { return vget_lane_s8(a,0); }
   2283 template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(const Packet16c& a) { return vgetq_lane_s8(a,0); }
   2284 template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet4uc>(const Packet4uc& a) { return static_cast<uint8_t>(a & 0xff); }
   2285 template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet8uc>(const Packet8uc& a) { return vget_lane_u8(a,0); }
   2286 template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(const Packet16uc& a) { return vgetq_lane_u8(a,0); }
   2287 template<> EIGEN_STRONG_INLINE int16_t pfirst<Packet4s>(const Packet4s& a) { return vget_lane_s16(a,0); }
   2288 template<> EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(const Packet8s& a) { return vgetq_lane_s16(a,0); }
   2289 template<> EIGEN_STRONG_INLINE uint16_t pfirst<Packet4us>(const Packet4us& a) { return vget_lane_u16(a,0); }
   2290 template<> EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(const Packet8us& a) { return vgetq_lane_u16(a,0); }
   2291 template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet2i>(const Packet2i& a) { return vget_lane_s32(a,0); }
   2292 template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { return vgetq_lane_s32(a,0); }
   2293 template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet2ui>(const Packet2ui& a) { return vget_lane_u32(a,0); }
   2294 template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) { return vgetq_lane_u32(a,0); }
   2295 template<> EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) { return vgetq_lane_s64(a,0); }
   2296 template<> EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(const Packet2ul& a) { return vgetq_lane_u64(a,0); }
   2297 
   2298 template<> EIGEN_STRONG_INLINE Packet2f preverse(const Packet2f& a) { return vrev64_f32(a); }
   2299 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
   2300 {
   2301   const float32x4_t a_r64 = vrev64q_f32(a);
   2302   return vcombine_f32(vget_high_f32(a_r64), vget_low_f32(a_r64));
   2303 }
   2304 template<> EIGEN_STRONG_INLINE Packet4c preverse(const Packet4c& a)
   2305 { return vget_lane_s32(vreinterpret_s32_s8(vrev64_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
   2306 template<> EIGEN_STRONG_INLINE Packet8c preverse(const Packet8c& a) { return vrev64_s8(a); }
   2307 template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a)
   2308 {
   2309   const int8x16_t a_r64 = vrev64q_s8(a);
   2310   return vcombine_s8(vget_high_s8(a_r64), vget_low_s8(a_r64));
   2311 }
   2312 template<> EIGEN_STRONG_INLINE Packet4uc preverse(const Packet4uc& a)
   2313 { return vget_lane_u32(vreinterpret_u32_u8(vrev64_u8(vreinterpret_u8_u32(vdup_n_u32(a)))), 0); }
   2314 template<> EIGEN_STRONG_INLINE Packet8uc preverse(const Packet8uc& a) { return vrev64_u8(a); }
   2315 template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a)
   2316 {
   2317   const uint8x16_t a_r64 = vrev64q_u8(a);
   2318   return vcombine_u8(vget_high_u8(a_r64), vget_low_u8(a_r64));
   2319 }
   2320 template<> EIGEN_STRONG_INLINE Packet4s preverse(const Packet4s& a) { return vrev64_s16(a); }
   2321 template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a)
   2322 {
   2323   const int16x8_t a_r64 = vrev64q_s16(a);
   2324   return vcombine_s16(vget_high_s16(a_r64), vget_low_s16(a_r64));
   2325 }
   2326 template<> EIGEN_STRONG_INLINE Packet4us preverse(const Packet4us& a) { return vrev64_u16(a); }
   2327 template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a)
   2328 {
   2329   const uint16x8_t a_r64 = vrev64q_u16(a);
   2330   return vcombine_u16(vget_high_u16(a_r64), vget_low_u16(a_r64));
   2331 }
   2332 template<> EIGEN_STRONG_INLINE Packet2i preverse(const Packet2i& a) { return vrev64_s32(a); }
   2333 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
   2334 {
   2335   const int32x4_t a_r64 = vrev64q_s32(a);
   2336   return vcombine_s32(vget_high_s32(a_r64), vget_low_s32(a_r64));
   2337 }
   2338 template<> EIGEN_STRONG_INLINE Packet2ui preverse(const Packet2ui& a) { return vrev64_u32(a); }
   2339 template<> EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a)
   2340 {
   2341   const uint32x4_t a_r64 = vrev64q_u32(a);
   2342   return vcombine_u32(vget_high_u32(a_r64), vget_low_u32(a_r64));
   2343 }
   2344 template<> EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a)
   2345 { return vcombine_s64(vget_high_s64(a), vget_low_s64(a)); }
   2346 template<> EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a)
   2347 { return vcombine_u64(vget_high_u64(a), vget_low_u64(a)); }
   2348 
   2349 template<> EIGEN_STRONG_INLINE Packet2f pabs(const Packet2f& a) { return vabs_f32(a); }
   2350 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
   2351 template<> EIGEN_STRONG_INLINE Packet4c pabs<Packet4c>(const Packet4c& a)
   2352 { return vget_lane_s32(vreinterpret_s32_s8(vabs_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
   2353 template<> EIGEN_STRONG_INLINE Packet8c pabs(const Packet8c& a) { return vabs_s8(a); }
   2354 template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vabsq_s8(a); }
   2355 template<> EIGEN_STRONG_INLINE Packet4uc pabs(const Packet4uc& a) { return a; }
   2356 template<> EIGEN_STRONG_INLINE Packet8uc pabs(const Packet8uc& a) { return a; }
   2357 template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; }
   2358 template<> EIGEN_STRONG_INLINE Packet4s pabs(const Packet4s& a) { return vabs_s16(a); }
   2359 template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vabsq_s16(a); }
   2360 template<> EIGEN_STRONG_INLINE Packet4us pabs(const Packet4us& a) { return a; }
   2361 template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; }
   2362 template<> EIGEN_STRONG_INLINE Packet2i pabs(const Packet2i& a) { return vabs_s32(a); }
   2363 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
   2364 template<> EIGEN_STRONG_INLINE Packet2ui pabs(const Packet2ui& a) { return a; }
   2365 template<> EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) { return a; }
   2366 template<> EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
   2367 #if EIGEN_ARCH_ARM64
   2368   return vabsq_s64(a);
   2369 #else
   2370   return vcombine_s64(
   2371       vdup_n_s64((std::abs)(vgetq_lane_s64(a, 0))),
   2372       vdup_n_s64((std::abs)(vgetq_lane_s64(a, 1))));
   2373 #endif
   2374 }
   2375 template<> EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) { return a; }
   2376 
   2377 template<> EIGEN_STRONG_INLINE Packet2f pfrexp<Packet2f>(const Packet2f& a, Packet2f& exponent)
   2378 { return pfrexp_generic(a,exponent); }
   2379 template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent)
   2380 { return pfrexp_generic(a,exponent); }
   2381 
   2382 template<> EIGEN_STRONG_INLINE Packet2f pldexp<Packet2f>(const Packet2f& a, const Packet2f& exponent)
   2383 { return pldexp_generic(a,exponent); }
   2384 template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent)
   2385 { return pldexp_generic(a,exponent); }
   2386 
   2387 template<> EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) { return vget_lane_f32(vpadd_f32(a,a), 0); }
   2388 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
   2389 {
   2390   const float32x2_t sum = vadd_f32(vget_low_f32(a), vget_high_f32(a));
   2391   return vget_lane_f32(vpadd_f32(sum, sum), 0);
   2392 }
   2393 template<> EIGEN_STRONG_INLINE int8_t predux<Packet4c>(const Packet4c& a)
   2394 {
   2395   const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
   2396   int8x8_t sum = vpadd_s8(a_dup, a_dup);
   2397   sum = vpadd_s8(sum, sum);
   2398   return vget_lane_s8(sum, 0);
   2399 }
   2400 template<> EIGEN_STRONG_INLINE int8_t predux<Packet8c>(const Packet8c& a)
   2401 {
   2402   int8x8_t sum = vpadd_s8(a,a);
   2403   sum = vpadd_s8(sum, sum);
   2404   sum = vpadd_s8(sum, sum);
   2405   return vget_lane_s8(sum, 0);
   2406 }
   2407 template<> EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a)
   2408 {
   2409   int8x8_t sum = vadd_s8(vget_low_s8(a), vget_high_s8(a));
   2410   sum = vpadd_s8(sum, sum);
   2411   sum = vpadd_s8(sum, sum);
   2412   sum = vpadd_s8(sum, sum);
   2413   return vget_lane_s8(sum, 0);
   2414 }
   2415 template<> EIGEN_STRONG_INLINE uint8_t predux<Packet4uc>(const Packet4uc& a)
   2416 {
   2417   const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
   2418   uint8x8_t sum = vpadd_u8(a_dup, a_dup);
   2419   sum = vpadd_u8(sum, sum);
   2420   return vget_lane_u8(sum, 0);
   2421 }
   2422 template<> EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(const Packet8uc& a)
   2423 {
   2424   uint8x8_t sum = vpadd_u8(a,a);
   2425   sum = vpadd_u8(sum, sum);
   2426   sum = vpadd_u8(sum, sum);
   2427   return vget_lane_u8(sum, 0);
   2428 }
   2429 template<> EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a)
   2430 {
   2431   uint8x8_t sum = vadd_u8(vget_low_u8(a), vget_high_u8(a));
   2432   sum = vpadd_u8(sum, sum);
   2433   sum = vpadd_u8(sum, sum);
   2434   sum = vpadd_u8(sum, sum);
   2435   return vget_lane_u8(sum, 0);
   2436 }
   2437 template<> EIGEN_STRONG_INLINE int16_t predux<Packet4s>(const Packet4s& a)
   2438 {
   2439   const int16x4_t sum = vpadd_s16(a,a);
   2440   return vget_lane_s16(vpadd_s16(sum, sum), 0);
   2441 }
   2442 template<> EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a)
   2443 {
   2444   int16x4_t sum = vadd_s16(vget_low_s16(a), vget_high_s16(a));
   2445   sum = vpadd_s16(sum, sum);
   2446   sum = vpadd_s16(sum, sum);
   2447   return vget_lane_s16(sum, 0);
   2448 }
   2449 template<> EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(const Packet4us& a)
   2450 {
   2451   const uint16x4_t sum = vpadd_u16(a,a);
   2452   return vget_lane_u16(vpadd_u16(sum, sum), 0);
   2453 }
   2454 template<> EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a)
   2455 {
   2456   uint16x4_t sum = vadd_u16(vget_low_u16(a), vget_high_u16(a));
   2457   sum = vpadd_u16(sum, sum);
   2458   sum = vpadd_u16(sum, sum);
   2459   return vget_lane_u16(sum, 0);
   2460 }
   2461 template<> EIGEN_STRONG_INLINE int32_t predux<Packet2i>(const Packet2i& a) { return vget_lane_s32(vpadd_s32(a,a), 0); }
   2462 template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a)
   2463 {
   2464   const int32x2_t sum = vadd_s32(vget_low_s32(a), vget_high_s32(a));
   2465   return vget_lane_s32(vpadd_s32(sum, sum), 0);
   2466 }
   2467 template<> EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(const Packet2ui& a) { return vget_lane_u32(vpadd_u32(a,a), 0); }
   2468 template<> EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a)
   2469 {
   2470   const uint32x2_t sum = vadd_u32(vget_low_u32(a), vget_high_u32(a));
   2471   return vget_lane_u32(vpadd_u32(sum, sum), 0);
   2472 }
   2473 template<> EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a)
   2474 { return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); }
   2475 template<> EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a)
   2476 { return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); }
   2477 
   2478 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c& a)
   2479 {
   2480   return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a,
   2481       vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0);
   2482 }
   2483 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half_dowto4(const Packet16c& a)
   2484 { return vadd_s8(vget_high_s8(a), vget_low_s8(a)); }
   2485 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half_dowto4(const Packet8uc& a)
   2486 {
   2487   return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a,
   2488       vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0);
   2489 }
   2490 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half_dowto4(const Packet16uc& a)
   2491 { return vadd_u8(vget_high_u8(a), vget_low_u8(a)); }
   2492 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half_dowto4(const Packet8s& a)
   2493 { return vadd_s16(vget_high_s16(a), vget_low_s16(a)); }
   2494 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(const Packet8us& a)
   2495 { return vadd_u16(vget_high_u16(a), vget_low_u16(a)); }
   2496 
   2497 // Other reduction functions:
   2498 // mul
   2499 template<> EIGEN_STRONG_INLINE float predux_mul<Packet2f>(const Packet2f& a)
   2500 { return vget_lane_f32(a, 0) * vget_lane_f32(a, 1); }
   2501 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
   2502 { return predux_mul(vmul_f32(vget_low_f32(a), vget_high_f32(a))); }
   2503 template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet4c>(const Packet4c& a)
   2504 {
   2505   int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a));
   2506   prod = vmul_s8(prod, vrev16_s8(prod));
   2507   return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 2);
   2508 }
   2509 template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet8c>(const Packet8c& a)
   2510 {
   2511   int8x8_t prod = vmul_s8(a, vrev16_s8(a));
   2512   prod = vmul_s8(prod, vrev32_s8(prod));
   2513   return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4);
   2514 }
   2515 template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a)
   2516 { return predux_mul(vmul_s8(vget_low_s8(a), vget_high_s8(a))); }
   2517 template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet4uc>(const Packet4uc& a)
   2518 {
   2519   uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a));
   2520   prod = vmul_u8(prod, vrev16_u8(prod));
   2521   return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 2);
   2522 }
   2523 template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet8uc>(const Packet8uc& a)
   2524 {
   2525   uint8x8_t prod = vmul_u8(a, vrev16_u8(a));
   2526   prod = vmul_u8(prod, vrev32_u8(prod));
   2527   return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4);
   2528 }
   2529 template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a)
   2530 { return predux_mul(vmul_u8(vget_low_u8(a), vget_high_u8(a))); }
   2531 template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet4s>(const Packet4s& a)
   2532 {
   2533   const int16x4_t prod = vmul_s16(a, vrev32_s16(a));
   2534   return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
   2535 }
   2536 template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(const Packet8s& a)
   2537 {
   2538   int16x4_t prod;
   2539 
   2540   // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
   2541   prod = vmul_s16(vget_low_s16(a), vget_high_s16(a));
   2542   // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
   2543   prod = vmul_s16(prod, vrev32_s16(prod));
   2544   // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
   2545   return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
   2546 }
   2547 template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet4us>(const Packet4us& a)
   2548 {
   2549   const uint16x4_t prod = vmul_u16(a, vrev32_u16(a));
   2550   return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
   2551 }
   2552 template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a)
   2553 {
   2554   uint16x4_t prod;
   2555 
   2556   // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
   2557   prod = vmul_u16(vget_low_u16(a), vget_high_u16(a));
   2558   // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
   2559   prod = vmul_u16(prod, vrev32_u16(prod));
   2560   // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
   2561   return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
   2562 }
   2563 template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet2i>(const Packet2i& a)
   2564 { return vget_lane_s32(a, 0) * vget_lane_s32(a, 1); }
   2565 template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
   2566 { return predux_mul(vmul_s32(vget_low_s32(a), vget_high_s32(a))); }
   2567 template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet2ui>(const Packet2ui& a)
   2568 { return vget_lane_u32(a, 0) * vget_lane_u32(a, 1); }
   2569 template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a)
   2570 { return predux_mul(vmul_u32(vget_low_u32(a), vget_high_u32(a))); }
   2571 template<> EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a)
   2572 { return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1); }
   2573 template<> EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a)
   2574 { return vgetq_lane_u64(a, 0) * vgetq_lane_u64(a, 1); }
   2575 
   2576 // min
   2577 template<> EIGEN_STRONG_INLINE float predux_min<Packet2f>(const Packet2f& a)
   2578 { return vget_lane_f32(vpmin_f32(a,a), 0); }
   2579 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
   2580 {
   2581   const float32x2_t min = vmin_f32(vget_low_f32(a), vget_high_f32(a));
   2582   return vget_lane_f32(vpmin_f32(min, min), 0);
   2583 }
   2584 template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet4c>(const Packet4c& a)
   2585 {
   2586   const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
   2587   int8x8_t min = vpmin_s8(a_dup, a_dup);
   2588   min = vpmin_s8(min, min);
   2589   return vget_lane_s8(min, 0);
   2590 }
   2591 template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(const Packet8c& a)
   2592 {
   2593   int8x8_t min = vpmin_s8(a,a);
   2594   min = vpmin_s8(min, min);
   2595   min = vpmin_s8(min, min);
   2596   return vget_lane_s8(min, 0);
   2597 }
   2598 template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a)
   2599 {
   2600   int8x8_t min = vmin_s8(vget_low_s8(a), vget_high_s8(a));
   2601   min = vpmin_s8(min, min);
   2602   min = vpmin_s8(min, min);
   2603   min = vpmin_s8(min, min);
   2604   return vget_lane_s8(min, 0);
   2605 }
   2606 template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet4uc>(const Packet4uc& a)
   2607 {
   2608   const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
   2609   uint8x8_t min = vpmin_u8(a_dup, a_dup);
   2610   min = vpmin_u8(min, min);
   2611   return vget_lane_u8(min, 0);
   2612 }
   2613 template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(const Packet8uc& a)
   2614 {
   2615   uint8x8_t min = vpmin_u8(a,a);
   2616   min = vpmin_u8(min, min);
   2617   min = vpmin_u8(min, min);
   2618   return vget_lane_u8(min, 0);
   2619 }
   2620 template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a)
   2621 {
   2622   uint8x8_t min = vmin_u8(vget_low_u8(a), vget_high_u8(a));
   2623   min = vpmin_u8(min, min);
   2624   min = vpmin_u8(min, min);
   2625   min = vpmin_u8(min, min);
   2626   return vget_lane_u8(min, 0);
   2627 }
   2628 template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(const Packet4s& a)
   2629 {
   2630   const int16x4_t min = vpmin_s16(a,a);
   2631   return vget_lane_s16(vpmin_s16(min, min), 0);
   2632 }
   2633 template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a)
   2634 {
   2635   int16x4_t min = vmin_s16(vget_low_s16(a), vget_high_s16(a));
   2636   min = vpmin_s16(min, min);
   2637   min = vpmin_s16(min, min);
   2638   return vget_lane_s16(min, 0);
   2639 }
   2640 template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(const Packet4us& a)
   2641 {
   2642   const uint16x4_t min = vpmin_u16(a,a);
   2643   return vget_lane_u16(vpmin_u16(min, min), 0);
   2644 }
   2645 template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a)
   2646 {
   2647   uint16x4_t min = vmin_u16(vget_low_u16(a), vget_high_u16(a));
   2648   min = vpmin_u16(min, min);
   2649   min = vpmin_u16(min, min);
   2650   return vget_lane_u16(min, 0);
   2651 }
   2652 template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(const Packet2i& a)
   2653 { return vget_lane_s32(vpmin_s32(a,a), 0); }
   2654 template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
   2655 {
   2656   const int32x2_t min = vmin_s32(vget_low_s32(a), vget_high_s32(a));
   2657   return vget_lane_s32(vpmin_s32(min, min), 0);
   2658 }
   2659 template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(const Packet2ui& a)
   2660 { return vget_lane_u32(vpmin_u32(a,a), 0); }
   2661 template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a)
   2662 {
   2663   const uint32x2_t min = vmin_u32(vget_low_u32(a), vget_high_u32(a));
   2664   return vget_lane_u32(vpmin_u32(min, min), 0);
   2665 }
   2666 template<> EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(const Packet2l& a)
   2667 { return (std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); }
   2668 template<> EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(const Packet2ul& a)
   2669 { return (std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); }
   2670 
   2671 // max
   2672 template<> EIGEN_STRONG_INLINE float predux_max<Packet2f>(const Packet2f& a)
   2673 { return vget_lane_f32(vpmax_f32(a,a), 0); }
   2674 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
   2675 {
   2676   const float32x2_t max = vmax_f32(vget_low_f32(a), vget_high_f32(a));
   2677   return vget_lane_f32(vpmax_f32(max, max), 0);
   2678 }
   2679 template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet4c>(const Packet4c& a)
   2680 {
   2681   const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
   2682   int8x8_t max = vpmax_s8(a_dup, a_dup);
   2683   max = vpmax_s8(max, max);
   2684   return vget_lane_s8(max, 0);
   2685 }
   2686 template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(const Packet8c& a)
   2687 {
   2688   int8x8_t max = vpmax_s8(a,a);
   2689   max = vpmax_s8(max, max);
   2690   max = vpmax_s8(max, max);
   2691   return vget_lane_s8(max, 0);
   2692 }
   2693 template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a)
   2694 {
   2695   int8x8_t max = vmax_s8(vget_low_s8(a), vget_high_s8(a));
   2696   max = vpmax_s8(max, max);
   2697   max = vpmax_s8(max, max);
   2698   max = vpmax_s8(max, max);
   2699   return vget_lane_s8(max, 0);
   2700 }
   2701 template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet4uc>(const Packet4uc& a)
   2702 {
   2703   const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
   2704   uint8x8_t max = vpmax_u8(a_dup, a_dup);
   2705   max = vpmax_u8(max, max);
   2706   return vget_lane_u8(max, 0);
   2707 }
   2708 template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(const Packet8uc& a)
   2709 {
   2710   uint8x8_t max = vpmax_u8(a,a);
   2711   max = vpmax_u8(max, max);
   2712   max = vpmax_u8(max, max);
   2713   return vget_lane_u8(max, 0);
   2714 }
   2715 template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a)
   2716 {
   2717   uint8x8_t max = vmax_u8(vget_low_u8(a), vget_high_u8(a));
   2718   max = vpmax_u8(max, max);
   2719   max = vpmax_u8(max, max);
   2720   max = vpmax_u8(max, max);
   2721   return vget_lane_u8(max, 0);
   2722 }
   2723 template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(const Packet4s& a)
   2724 {
   2725   const int16x4_t max = vpmax_s16(a,a);
   2726   return vget_lane_s16(vpmax_s16(max, max), 0);
   2727 }
   2728 template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a)
   2729 {
   2730   int16x4_t max = vmax_s16(vget_low_s16(a), vget_high_s16(a));
   2731   max = vpmax_s16(max, max);
   2732   max = vpmax_s16(max, max);
   2733   return vget_lane_s16(max, 0);
   2734 }
   2735 template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(const Packet4us& a)
   2736 {
   2737   const uint16x4_t max = vpmax_u16(a,a);
   2738   return vget_lane_u16(vpmax_u16(max, max), 0);
   2739 }
   2740 template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a)
   2741 {
   2742   uint16x4_t max = vmax_u16(vget_low_u16(a), vget_high_u16(a));
   2743   max = vpmax_u16(max, max);
   2744   max = vpmax_u16(max, max);
   2745   return vget_lane_u16(max, 0);
   2746 }
   2747 template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(const Packet2i& a)
   2748 { return vget_lane_s32(vpmax_s32(a,a), 0); }
   2749 template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
   2750 {
   2751   const int32x2_t max = vmax_s32(vget_low_s32(a), vget_high_s32(a));
   2752   return vget_lane_s32(vpmax_s32(max, max), 0);
   2753 }
   2754 template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(const Packet2ui& a)
   2755 { return vget_lane_u32(vpmax_u32(a,a), 0); }
   2756 template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a)
   2757 {
   2758   const uint32x2_t max = vmax_u32(vget_low_u32(a), vget_high_u32(a));
   2759   return vget_lane_u32(vpmax_u32(max, max), 0);
   2760 }
   2761 template<> EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(const Packet2l& a)
   2762 { return (std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); }
   2763 template<> EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(const Packet2ul& a)
   2764 { return (std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); }
   2765 
   2766 template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
   2767 {
   2768   uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)),
   2769                             vget_high_u32(vreinterpretq_u32_f32(x)));
   2770   return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
   2771 }
   2772 
   2773 // Helpers for ptranspose.
   2774 namespace detail {
   2775   
   2776 template<typename Packet>
   2777 void zip_in_place(Packet& p1, Packet& p2);
   2778 
   2779 template<>
   2780 EIGEN_ALWAYS_INLINE void zip_in_place<Packet2f>(Packet2f& p1, Packet2f& p2) {
   2781   const float32x2x2_t tmp = vzip_f32(p1, p2);
   2782   p1 = tmp.val[0];
   2783   p2 = tmp.val[1];
   2784 }
   2785 
   2786 template<>
   2787 EIGEN_ALWAYS_INLINE void zip_in_place<Packet4f>(Packet4f& p1, Packet4f& p2) {
   2788   const float32x4x2_t tmp = vzipq_f32(p1, p2);
   2789   p1 = tmp.val[0];
   2790   p2 = tmp.val[1];
   2791 }
   2792 
   2793 template<>
   2794 EIGEN_ALWAYS_INLINE void zip_in_place<Packet8c>(Packet8c& p1, Packet8c& p2) {
   2795   const int8x8x2_t tmp = vzip_s8(p1, p2);
   2796   p1 = tmp.val[0];
   2797   p2 = tmp.val[1];
   2798 }
   2799 
   2800 template<>
   2801 EIGEN_ALWAYS_INLINE void zip_in_place<Packet16c>(Packet16c& p1, Packet16c& p2) {
   2802   const int8x16x2_t tmp = vzipq_s8(p1, p2);
   2803   p1 = tmp.val[0];
   2804   p2 = tmp.val[1];
   2805 }
   2806 
   2807 template<>
   2808 EIGEN_ALWAYS_INLINE void zip_in_place<Packet8uc>(Packet8uc& p1, Packet8uc& p2) {
   2809   const uint8x8x2_t tmp = vzip_u8(p1, p2);
   2810   p1 = tmp.val[0];
   2811   p2 = tmp.val[1];
   2812 }
   2813 
   2814 template<>
   2815 EIGEN_ALWAYS_INLINE void zip_in_place<Packet16uc>(Packet16uc& p1, Packet16uc& p2) {
   2816   const uint8x16x2_t tmp = vzipq_u8(p1, p2);
   2817   p1 = tmp.val[0];
   2818   p2 = tmp.val[1];
   2819 }
   2820 
   2821 template<>
   2822 EIGEN_ALWAYS_INLINE void zip_in_place<Packet2i>(Packet2i& p1, Packet2i& p2) {
   2823   const int32x2x2_t tmp = vzip_s32(p1, p2);
   2824   p1 = tmp.val[0];
   2825   p2 = tmp.val[1];
   2826 }
   2827 
   2828 template<>
   2829 EIGEN_ALWAYS_INLINE void zip_in_place<Packet4i>(Packet4i& p1, Packet4i& p2) {
   2830   const int32x4x2_t tmp = vzipq_s32(p1, p2);
   2831   p1 = tmp.val[0];
   2832   p2 = tmp.val[1];
   2833 }
   2834 
   2835 template<>
   2836 EIGEN_ALWAYS_INLINE void zip_in_place<Packet2ui>(Packet2ui& p1, Packet2ui& p2) {
   2837   const uint32x2x2_t tmp = vzip_u32(p1, p2);
   2838   p1 = tmp.val[0];
   2839   p2 = tmp.val[1];
   2840 }
   2841 
   2842 template<>
   2843 EIGEN_ALWAYS_INLINE void zip_in_place<Packet4ui>(Packet4ui& p1, Packet4ui& p2) {
   2844   const uint32x4x2_t tmp = vzipq_u32(p1, p2);
   2845   p1 = tmp.val[0];
   2846   p2 = tmp.val[1];
   2847 }
   2848 
   2849 template<>
   2850 EIGEN_ALWAYS_INLINE void zip_in_place<Packet4s>(Packet4s& p1, Packet4s& p2) {
   2851   const int16x4x2_t tmp = vzip_s16(p1, p2);
   2852   p1 = tmp.val[0];
   2853   p2 = tmp.val[1];
   2854 }
   2855 
   2856 template<>
   2857 EIGEN_ALWAYS_INLINE void zip_in_place<Packet8s>(Packet8s& p1, Packet8s& p2) {
   2858   const int16x8x2_t tmp = vzipq_s16(p1, p2);
   2859   p1 = tmp.val[0];
   2860   p2 = tmp.val[1];
   2861 }
   2862 
   2863 template<>
   2864 EIGEN_ALWAYS_INLINE void zip_in_place<Packet4us>(Packet4us& p1, Packet4us& p2) {
   2865   const uint16x4x2_t tmp = vzip_u16(p1, p2);
   2866   p1 = tmp.val[0];
   2867   p2 = tmp.val[1];
   2868 }
   2869 
   2870 template<>
   2871 EIGEN_ALWAYS_INLINE void zip_in_place<Packet8us>(Packet8us& p1, Packet8us& p2) {
   2872   const uint16x8x2_t tmp = vzipq_u16(p1, p2);
   2873   p1 = tmp.val[0];
   2874   p2 = tmp.val[1];
   2875 }
   2876 
   2877 template<typename Packet>
   2878 EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 2>& kernel) {
   2879   zip_in_place(kernel.packet[0], kernel.packet[1]);
   2880 }
   2881 
   2882 template<typename Packet>
   2883 EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 4>& kernel) {
   2884   zip_in_place(kernel.packet[0], kernel.packet[2]);
   2885   zip_in_place(kernel.packet[1], kernel.packet[3]);
   2886   zip_in_place(kernel.packet[0], kernel.packet[1]);
   2887   zip_in_place(kernel.packet[2], kernel.packet[3]);
   2888 }
   2889 
   2890 template<typename Packet>
   2891 EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 8>& kernel) {
   2892   zip_in_place(kernel.packet[0], kernel.packet[4]);
   2893   zip_in_place(kernel.packet[1], kernel.packet[5]);
   2894   zip_in_place(kernel.packet[2], kernel.packet[6]);
   2895   zip_in_place(kernel.packet[3], kernel.packet[7]);
   2896 
   2897   zip_in_place(kernel.packet[0], kernel.packet[2]);
   2898   zip_in_place(kernel.packet[1], kernel.packet[3]);
   2899   zip_in_place(kernel.packet[4], kernel.packet[6]);
   2900   zip_in_place(kernel.packet[5], kernel.packet[7]);
   2901   
   2902   zip_in_place(kernel.packet[0], kernel.packet[1]);
   2903   zip_in_place(kernel.packet[2], kernel.packet[3]);
   2904   zip_in_place(kernel.packet[4], kernel.packet[5]);
   2905   zip_in_place(kernel.packet[6], kernel.packet[7]);
   2906 }
   2907 
   2908 template<typename Packet>
   2909 EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 16>& kernel) {
   2910   EIGEN_UNROLL_LOOP
   2911   for (int i=0; i<4; ++i) {
   2912     const int m = (1 << i);
   2913     EIGEN_UNROLL_LOOP
   2914     for (int j=0; j<m; ++j) {
   2915       const int n = (1 << (3-i));
   2916       EIGEN_UNROLL_LOOP
   2917       for (int k=0; k<n; ++k) {
   2918         const int idx = 2*j*n+k;
   2919         zip_in_place(kernel.packet[idx], kernel.packet[idx + n]);
   2920       }
   2921     }
   2922   }
   2923 }
   2924 
   2925 } // namespace detail
   2926 
   2927 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2f, 2>& kernel) {
   2928   detail::ptranspose_impl(kernel);
   2929 }
   2930 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
   2931   detail::ptranspose_impl(kernel);
   2932 }
   2933 
   2934 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4c, 4>& kernel)
   2935 {
   2936   const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1));
   2937   const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1));
   2938 
   2939   const int8x8x2_t zip8 = vzip_s8(a,b);
   2940   const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1]));
   2941 
   2942   kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0);
   2943   kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1);
   2944   kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0);
   2945   kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1);
   2946 }
   2947 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8c, 8>& kernel) {
   2948   detail::ptranspose_impl(kernel);
   2949 }
   2950 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8c, 4>& kernel) {
   2951   detail::ptranspose_impl(kernel);
   2952 }
   2953 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
   2954   detail::ptranspose_impl(kernel);
   2955 }
   2956 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 8>& kernel) {
   2957   detail::ptranspose_impl(kernel);
   2958 }
   2959 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
   2960   detail::ptranspose_impl(kernel);
   2961 }
   2962 
   2963 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4uc, 4>& kernel)
   2964 {
   2965   const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1));
   2966   const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1));
   2967 
   2968   const uint8x8x2_t zip8 = vzip_u8(a,b);
   2969   const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1]));
   2970 
   2971   kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0);
   2972   kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1);
   2973   kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0);
   2974   kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1);
   2975 }
   2976 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8uc, 8>& kernel) {
   2977   detail::ptranspose_impl(kernel);
   2978 }
   2979 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8uc, 4>& kernel) {
   2980   detail::ptranspose_impl(kernel);
   2981 }
   2982 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
   2983   detail::ptranspose_impl(kernel);
   2984 }
   2985 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 8>& kernel) {
   2986   detail::ptranspose_impl(kernel);
   2987 }
   2988 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
   2989   detail::ptranspose_impl(kernel);
   2990 }
   2991 
   2992 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4s, 4>& kernel) {
   2993   detail::ptranspose_impl(kernel);
   2994 }
   2995 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
   2996   detail::ptranspose_impl(kernel);
   2997 }
   2998 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
   2999   detail::ptranspose_impl(kernel);
   3000 }
   3001 
   3002 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4us, 4>& kernel) {
   3003   detail::ptranspose_impl(kernel);
   3004 }
   3005 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
   3006   detail::ptranspose_impl(kernel);
   3007 }
   3008 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
   3009   detail::ptranspose_impl(kernel);
   3010 }
   3011 
   3012 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2i, 2>& kernel) {
   3013   detail::ptranspose_impl(kernel);
   3014 }
   3015 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
   3016     detail::ptranspose_impl(kernel);
   3017 }
   3018 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ui, 2>& kernel) {
   3019   detail::zip_in_place(kernel.packet[0], kernel.packet[1]);
   3020 }
   3021 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
   3022   detail::ptranspose_impl(kernel);
   3023 }
   3024 
   3025 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
   3026 ptranspose(PacketBlock<Packet2l, 2>& kernel)
   3027 {
   3028 #if EIGEN_ARCH_ARM64
   3029   const int64x2_t tmp1 = vzip1q_s64(kernel.packet[0], kernel.packet[1]);
   3030   kernel.packet[1] = vzip2q_s64(kernel.packet[0], kernel.packet[1]);
   3031   kernel.packet[0] = tmp1;
   3032 #else
   3033   const int64x1_t tmp[2][2] = {
   3034     { vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0]) },
   3035     { vget_low_s64(kernel.packet[1]), vget_high_s64(kernel.packet[1]) }
   3036   };
   3037 
   3038   kernel.packet[0] = vcombine_s64(tmp[0][0], tmp[1][0]);
   3039   kernel.packet[1] = vcombine_s64(tmp[0][1], tmp[1][1]);
   3040 #endif
   3041 }
   3042 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
   3043 ptranspose(PacketBlock<Packet2ul, 2>& kernel)
   3044 {
   3045 #if EIGEN_ARCH_ARM64
   3046   const uint64x2_t tmp1 = vzip1q_u64(kernel.packet[0], kernel.packet[1]);
   3047   kernel.packet[1] = vzip2q_u64(kernel.packet[0], kernel.packet[1]);
   3048   kernel.packet[0] = tmp1;
   3049 #else
   3050   const uint64x1_t tmp[2][2] = {
   3051     { vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0]) },
   3052     { vget_low_u64(kernel.packet[1]), vget_high_u64(kernel.packet[1]) }
   3053   };
   3054 
   3055   kernel.packet[0] = vcombine_u64(tmp[0][0], tmp[1][0]);
   3056   kernel.packet[1] = vcombine_u64(tmp[0][1], tmp[1][1]);
   3057 #endif
   3058 }
   3059 
   3060 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pselect( const Packet2f& mask, const Packet2f& a, const Packet2f& b)
   3061 { return vbsl_f32(vreinterpret_u32_f32(mask), a, b); }
   3062 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b)
   3063 { return vbslq_f32(vreinterpretq_u32_f32(mask), a, b); }
   3064 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pselect(const Packet8c& mask, const Packet8c& a, const Packet8c& b)
   3065 { return vbsl_s8(vreinterpret_u8_s8(mask), a, b); }
   3066 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b)
   3067 { return vbslq_s8(vreinterpretq_u8_s8(mask), a, b); }
   3068 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pselect(const Packet8uc& mask, const Packet8uc& a, const Packet8uc& b)
   3069 { return vbsl_u8(mask, a, b); }
   3070 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a, const Packet16uc& b)
   3071 { return vbslq_u8(mask, a, b); }
   3072 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pselect(const Packet4s& mask, const Packet4s& a, const Packet4s& b)
   3073 { return vbsl_s16(vreinterpret_u16_s16(mask), a, b); }
   3074 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b)
   3075 { return vbslq_s16(vreinterpretq_u16_s16(mask), a, b); }
   3076 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pselect(const Packet4us& mask, const Packet4us& a, const Packet4us& b)
   3077 { return vbsl_u16(mask, a, b); }
   3078 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b)
   3079 { return vbslq_u16(mask, a, b); }
   3080 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pselect(const Packet2i& mask, const Packet2i& a, const Packet2i& b)
   3081 { return vbsl_s32(vreinterpret_u32_s32(mask), a, b); }
   3082 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b)
   3083 { return vbslq_s32(vreinterpretq_u32_s32(mask), a, b); }
   3084 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pselect(const Packet2ui& mask, const Packet2ui& a, const Packet2ui& b)
   3085 { return vbsl_u32(mask, a, b); }
   3086 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b)
   3087 { return vbslq_u32(mask, a, b); }
   3088 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b)
   3089 { return vbslq_s64(vreinterpretq_u64_s64(mask), a, b); }
   3090 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b)
   3091 { return vbslq_u64(mask, a, b); }
   3092 
// Use ARMv8 rounding intrinsics if available.
   3094 #if EIGEN_ARCH_ARMV8
   3095 template<> EIGEN_STRONG_INLINE Packet2f print<Packet2f>(const Packet2f& a)
   3096 { return vrndn_f32(a); }
   3097 
   3098 template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
   3099 { return vrndnq_f32(a); }
   3100 
   3101 template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a)
   3102 { return vrndm_f32(a); }
   3103 
   3104 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
   3105 { return vrndmq_f32(a); }
   3106 
   3107 template<> EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a)
   3108 { return vrndp_f32(a); }
   3109 
   3110 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
   3111 { return vrndpq_f32(a); }
   3112 
   3113 #else
   3114 
   3115 template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
   3116   // Adds and subtracts signum(a) * 2^23 to force rounding.
   3117   const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23));
   3118   const Packet4f abs_a = pabs(a);
   3119   Packet4f r = padd(abs_a, limit);
   3120   // Don't compile-away addition and subtraction.
   3121   EIGEN_OPTIMIZATION_BARRIER(r);
   3122   r = psub(r, limit);
   3123   // If greater than limit, simply return a.  Otherwise, account for sign.
   3124   r = pselect(pcmp_lt(abs_a, limit),
   3125               pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
   3126   return r;
   3127 }
   3128 
   3129 template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) {
   3130   // Adds and subtracts signum(a) * 2^23 to force rounding.
   3131   const Packet2f limit = pset1<Packet2f>(static_cast<float>(1<<23));
   3132   const Packet2f abs_a = pabs(a);
   3133   Packet2f r = padd(abs_a, limit);
   3134   // Don't compile-away addition and subtraction.
   3135   EIGEN_OPTIMIZATION_BARRIER(r);
   3136   r = psub(r, limit);
   3137   // If greater than limit, simply return a.  Otherwise, account for sign.
   3138   r = pselect(pcmp_lt(abs_a, limit),
   3139               pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
   3140   return r;
   3141 }
   3142 
   3143 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
   3144 {
   3145   const Packet4f cst_1 = pset1<Packet4f>(1.0f);
   3146   Packet4f tmp  = print<Packet4f>(a);
   3147   // If greater, subtract one.
   3148   Packet4f mask = pcmp_lt(a, tmp);
   3149   mask = pand(mask, cst_1);
   3150   return psub(tmp, mask);
   3151 }
   3152 
   3153 template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a)
   3154 {
   3155   const Packet2f cst_1 = pset1<Packet2f>(1.0f);
   3156   Packet2f tmp  = print<Packet2f>(a);
   3157   // If greater, subtract one.
   3158   Packet2f mask = pcmp_lt(a, tmp);
   3159   mask = pand(mask, cst_1);
   3160   return psub(tmp, mask);
   3161 }
   3162 
   3163 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
   3164 {
   3165   const Packet4f cst_1 = pset1<Packet4f>(1.0f);
   3166   Packet4f tmp  = print<Packet4f>(a);
   3167   // If smaller, add one.
   3168   Packet4f mask = pcmp_lt(tmp, a);
   3169   mask = pand(mask, cst_1);
   3170   return padd(tmp, mask);
   3171 }
   3172 
   3173 template<> EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a)
   3174 {
   3175   const Packet2f cst_1 = pset1<Packet2f>(1.0);
   3176   Packet2f tmp  = print<Packet2f>(a);
   3177   // If smaller, add one.
   3178   Packet2f mask = pcmp_lt(tmp, a);
   3179   mask = pand(mask, cst_1);
   3180   return padd(tmp, mask);
   3181 }
   3182 
   3183 #endif
   3184 
   3185 /**
   3186  * Computes the integer square root
   3187  * @remarks The calculation is performed using an algorithm which iterates through each binary digit of the result
   3188  *   and tests whether setting that digit to 1 would cause the square of the value to be greater than the argument
   3189  *   value. The algorithm is described in detail here: http://ww1.microchip.com/downloads/en/AppNotes/91040a.pdf .
   3190  */
   3191 template<> EIGEN_STRONG_INLINE Packet4uc psqrt(const Packet4uc& a) {
   3192   uint8x8_t x = vreinterpret_u8_u32(vdup_n_u32(a));
   3193   uint8x8_t res = vdup_n_u8(0);
   3194   uint8x8_t add = vdup_n_u8(0x8);
   3195   for (int i = 0; i < 4; i++)
   3196   {
   3197     const uint8x8_t temp = vorr_u8(res, add);
   3198     res = vbsl_u8(vcge_u8(x, vmul_u8(temp, temp)), temp, res);
   3199     add = vshr_n_u8(add, 1);
   3200   }
   3201   return vget_lane_u32(vreinterpret_u32_u8(res), 0);
   3202 }
   3203 /// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
   3204 template<> EIGEN_STRONG_INLINE Packet8uc psqrt(const Packet8uc& a) {
   3205   uint8x8_t res = vdup_n_u8(0);
   3206   uint8x8_t add = vdup_n_u8(0x8);
   3207   for (int i = 0; i < 4; i++)
   3208   {
   3209     const uint8x8_t temp = vorr_u8(res, add);
   3210     res = vbsl_u8(vcge_u8(a, vmul_u8(temp, temp)), temp, res);
   3211     add = vshr_n_u8(add, 1);
   3212   }
   3213   return res;
   3214 }
   3215 /// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
   3216 template<> EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) {
   3217   uint8x16_t res = vdupq_n_u8(0);
   3218   uint8x16_t add = vdupq_n_u8(0x8);
   3219   for (int i = 0; i < 4; i++)
   3220   {
   3221     const uint8x16_t temp = vorrq_u8(res, add);
   3222     res = vbslq_u8(vcgeq_u8(a, vmulq_u8(temp, temp)), temp, res);
   3223     add = vshrq_n_u8(add, 1);
   3224   }
   3225   return res;
   3226 }
   3227 /// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
   3228 template<> EIGEN_STRONG_INLINE Packet4us psqrt(const Packet4us& a) {
   3229   uint16x4_t res = vdup_n_u16(0);
   3230   uint16x4_t add = vdup_n_u16(0x80);
   3231   for (int i = 0; i < 8; i++)
   3232   {
   3233     const uint16x4_t temp = vorr_u16(res, add);
   3234     res = vbsl_u16(vcge_u16(a, vmul_u16(temp, temp)), temp, res);
   3235     add = vshr_n_u16(add, 1);
   3236   }
   3237   return res;
   3238 }
   3239 /// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
   3240 template<> EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) {
   3241   uint16x8_t res = vdupq_n_u16(0);
   3242   uint16x8_t add = vdupq_n_u16(0x80);
   3243   for (int i = 0; i < 8; i++)
   3244   {
   3245     const uint16x8_t temp = vorrq_u16(res, add);
   3246     res = vbslq_u16(vcgeq_u16(a, vmulq_u16(temp, temp)), temp, res);
   3247     add = vshrq_n_u16(add, 1);
   3248   }
   3249   return res;
   3250 }
   3251 /// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
   3252 template<> EIGEN_STRONG_INLINE Packet2ui psqrt(const Packet2ui& a) {
   3253   uint32x2_t res = vdup_n_u32(0);
   3254   uint32x2_t add = vdup_n_u32(0x8000);
   3255   for (int i = 0; i < 16; i++)
   3256   {
   3257     const uint32x2_t temp = vorr_u32(res, add);
   3258     res = vbsl_u32(vcge_u32(a, vmul_u32(temp, temp)), temp, res);
   3259     add = vshr_n_u32(add, 1);
   3260   }
   3261   return res;
   3262 }
   3263 /// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
   3264 template<> EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
   3265   uint32x4_t res = vdupq_n_u32(0);
   3266   uint32x4_t add = vdupq_n_u32(0x8000);
   3267   for (int i = 0; i < 16; i++)
   3268   {
   3269     const uint32x4_t temp = vorrq_u32(res, add);
   3270     res = vbslq_u32(vcgeq_u32(a, vmulq_u32(temp, temp)), temp, res);
   3271     add = vshrq_n_u32(add, 1);
   3272   }
   3273   return res;
   3274 }
   3275 
   3276 template<> EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
   3277   // Compute approximate reciprocal sqrt.
   3278   Packet4f x = vrsqrteq_f32(a);
   3279   // Do Newton iterations for 1/sqrt(x).
   3280   x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x);
   3281   x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x);
   3282   const Packet4f infinity = pset1<Packet4f>(NumTraits<float>::infinity());
   3283   return pselect(pcmp_eq(a, pzero(a)), infinity, x);
   3284 }
   3285 
   3286 template<> EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) {
   3287   // Compute approximate reciprocal sqrt.
   3288   Packet2f x = vrsqrte_f32(a);
   3289   // Do Newton iterations for 1/sqrt(x).
   3290   x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x);
   3291   x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x);
   3292   const Packet2f infinity = pset1<Packet2f>(NumTraits<float>::infinity());
   3293   return pselect(pcmp_eq(a, pzero(a)), infinity, x);
   3294 }
   3295 
   3296 // Unfortunately vsqrt_f32 is only available for A64.
   3297 #if EIGEN_ARCH_ARM64
   3298 template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& _x){return vsqrtq_f32(_x);}
   3299 template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& _x){return vsqrt_f32(_x); }
   3300 #else
   3301 template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
   3302   const Packet4f infinity = pset1<Packet4f>(NumTraits<float>::infinity());
   3303   const Packet4f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity));
   3304   return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a)));
   3305 }
   3306 template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
   3307   const Packet2f infinity = pset1<Packet2f>(NumTraits<float>::infinity());
   3308   const Packet2f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity));
   3309   return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a)));
   3310 }
   3311 #endif
   3312 
   3313 //---------- bfloat16 ----------
   3314 // TODO: Add support for native armv8.6-a bfloat16_t
   3315 
   3316 // TODO: Guard if we have native bfloat16 support
   3317 typedef eigen_packet_wrapper<uint16x4_t, 19> Packet4bf;
   3318 
   3319 template<> struct is_arithmetic<Packet4bf> { enum { value = true }; };
   3320 
   3321 template<> struct packet_traits<bfloat16> : default_packet_traits
   3322 {
   3323   typedef Packet4bf type;
   3324   typedef Packet4bf half;
   3325   enum
   3326   {
   3327     Vectorizable = 1,
   3328     AlignedOnScalar = 1,
   3329     size = 4,
   3330     HasHalfPacket = 0,
   3331 
   3332     HasCmp       = 1,
   3333     HasAdd       = 1,
   3334     HasSub       = 1,
   3335     HasShift     = 1,
   3336     HasMul       = 1,
   3337     HasNegate    = 1,
   3338     HasAbs       = 1,
   3339     HasArg       = 0,
   3340     HasAbs2      = 1,
   3341     HasAbsDiff   = 1,
   3342     HasMin       = 1,
   3343     HasMax       = 1,
   3344     HasConj      = 1,
   3345     HasSetLinear = 0,
   3346     HasBlend     = 0,
   3347     HasDiv       = 1,
   3348     HasFloor     = 1,
   3349     HasCeil      = 1,
   3350     HasRint      = 1,
   3351 
   3352     HasSin  = EIGEN_FAST_MATH,
   3353     HasCos  = EIGEN_FAST_MATH,
   3354     HasLog  = 1,
   3355     HasExp  = 1,
   3356     HasSqrt = 0,
   3357     HasTanh = EIGEN_FAST_MATH,
   3358     HasErf  = EIGEN_FAST_MATH,
   3359     HasBessel = 0,  // Issues with accuracy.
   3360     HasNdtri = 0
   3361   };
   3362 };
   3363 
   3364 template<> struct unpacket_traits<Packet4bf>
   3365 {
   3366   typedef bfloat16 type;
   3367   typedef Packet4bf half;
   3368   enum
   3369   {
   3370     size = 4,
   3371     alignment = Aligned16,
   3372     vectorizable = true,
   3373     masked_load_available = false,
   3374     masked_store_available = false
   3375   };
   3376 };
   3377 
   3378 namespace detail {  
   3379 template<>
   3380 EIGEN_ALWAYS_INLINE void zip_in_place<Packet4bf>(Packet4bf& p1, Packet4bf& p2) {
   3381   const uint16x4x2_t tmp = vzip_u16(p1, p2);
   3382   p1 = tmp.val[0];
   3383   p2 = tmp.val[1];
   3384 }
   3385 } // namespace detail
   3386 
   3387 EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p)
   3388 {
   3389   // See the scalar implemention in BFloat16.h for a comprehensible explanation
   3390   // of this fast rounding algorithm
   3391   Packet4ui input = reinterpret_cast<Packet4ui>(p);
   3392 
   3393   // lsb = (input >> 16) & 1
   3394   Packet4ui lsb =  vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1));
   3395 
   3396   // rounding_bias = 0x7fff + lsb
   3397   Packet4ui rounding_bias = vaddq_u32(lsb, vdupq_n_u32(0x7fff));
   3398 
   3399   // input += rounding_bias
   3400   input = vaddq_u32(input, rounding_bias);
   3401 
   3402   // input = input >> 16
   3403   input = vshrq_n_u32(input, 16);
   3404 
   3405   // Replace float-nans by bfloat16-nans, that is 0x7fc0
   3406   const Packet4ui bf16_nan = vdupq_n_u32(0x7fc0);
   3407   const Packet4ui mask = vceqq_f32(p, p);
   3408   input = vbslq_u32(mask, input, bf16_nan);
   3409 
   3410   // output = static_cast<uint16_t>(input)
   3411   return vmovn_u32(input);
   3412 }
   3413 
   3414 EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p)
   3415 {
   3416   return reinterpret_cast<Packet4f>(vshlq_n_u32(vmovl_u16(p), 16));
   3417 }
   3418 
   3419 EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) {
   3420   return vmovn_u32(vreinterpretq_u32_f32(p));
   3421 }
   3422 
   3423 template<> EIGEN_STRONG_INLINE Packet4bf pset1<Packet4bf>(const bfloat16& from) {
   3424   return pset1<Packet4us>(from.value);
   3425 }
   3426 
   3427 template<> EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(const Packet4bf& from) {
   3428   return bfloat16_impl::raw_uint16_to_bfloat16(static_cast<uint16_t>(pfirst<Packet4us>(from)));
   3429 }
   3430 
   3431 template<> EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const bfloat16* from)
   3432 {
   3433   return pload<Packet4us>(reinterpret_cast<const uint16_t*>(from));
   3434 }
   3435 
   3436 template<> EIGEN_STRONG_INLINE Packet4bf ploadu<Packet4bf>(const bfloat16* from)
   3437 {
   3438   return ploadu<Packet4us>(reinterpret_cast<const uint16_t*>(from));
   3439 }
   3440 
   3441 template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet4bf& from)
   3442 {
   3443   EIGEN_DEBUG_ALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
   3444 }
   3445 
   3446 template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet4bf& from)
   3447 {
   3448   EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
   3449 }
   3450 
   3451 template<> EIGEN_STRONG_INLINE Packet4bf ploaddup<Packet4bf>(const bfloat16* from)
   3452 {
   3453   return ploaddup<Packet4us>(reinterpret_cast<const uint16_t*>(from));
   3454 }
   3455 
   3456 template <> EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) {
   3457   return F32ToBf16(pabs<Packet4f>(Bf16ToF32(a)));
   3458 }
   3459 
   3460 template <> EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNumbers, Packet4bf>(const Packet4bf &a,
   3461                                                                             const Packet4bf &b)
   3462 {
   3463   return F32ToBf16(pmin<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
   3464 }
   3465 template <> EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNaN, Packet4bf>(const Packet4bf &a,
   3466                                                                         const Packet4bf &b)
   3467 {
   3468   return F32ToBf16(pmin<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
   3469 }
   3470 
   3471 template <> EIGEN_STRONG_INLINE Packet4bf pmin<Packet4bf>(const Packet4bf &a,
   3472                                                           const Packet4bf &b)
   3473 {
   3474   return F32ToBf16(pmin<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
   3475 }
   3476 
   3477 template <> EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNumbers, Packet4bf>(const Packet4bf &a,
   3478                                                                             const Packet4bf &b)
   3479 {
   3480   return F32ToBf16(pmax<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
   3481 }
   3482 template <> EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNaN, Packet4bf>(const Packet4bf &a,
   3483                                                                         const Packet4bf &b)
   3484 {
   3485   return F32ToBf16(pmax<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
   3486 }
   3487 
   3488 template <> EIGEN_STRONG_INLINE Packet4bf pmax<Packet4bf>(const Packet4bf &a,
   3489                                                           const Packet4bf &b)
   3490 {
   3491   return F32ToBf16(pmax<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
   3492 }
   3493 
   3494 template<> EIGEN_STRONG_INLINE Packet4bf plset<Packet4bf>(const bfloat16& a)
   3495 {
   3496   return F32ToBf16(plset<Packet4f>(static_cast<float>(a)));
   3497 }
   3498 
   3499 template<> EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a,const Packet4bf& b) {
   3500   return por<Packet4us>(a, b);
   3501 }
   3502 
   3503 template<> EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a,const Packet4bf& b) {
   3504   return pxor<Packet4us>(a, b);
   3505 }
   3506 
   3507 template<> EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a,const Packet4bf& b) {
   3508   return pand<Packet4us>(a, b);
   3509 }
   3510 
   3511 template<> EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a,const Packet4bf& b) {
   3512   return pandnot<Packet4us>(a, b);
   3513 }
   3514 
   3515 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a,
   3516                                                       const Packet4bf& b)
   3517 {
   3518   return pselect<Packet4us>(mask, a, b);
   3519 }
   3520 
   3521 template<> EIGEN_STRONG_INLINE Packet4bf print<Packet4bf>(const Packet4bf& a)
   3522 {
   3523   return F32ToBf16(print<Packet4f>(Bf16ToF32(a)));
   3524 }
   3525 
   3526 template<> EIGEN_STRONG_INLINE Packet4bf pfloor<Packet4bf>(const Packet4bf& a)
   3527 {
   3528   return F32ToBf16(pfloor<Packet4f>(Bf16ToF32(a)));
   3529 }
   3530 
   3531 template<> EIGEN_STRONG_INLINE Packet4bf pceil<Packet4bf>(const Packet4bf& a)
   3532 {
   3533   return F32ToBf16(pceil<Packet4f>(Bf16ToF32(a)));
   3534 }
   3535 
   3536 template<> EIGEN_STRONG_INLINE Packet4bf pconj(const Packet4bf& a) { return a; }
   3537 
   3538 template<> EIGEN_STRONG_INLINE Packet4bf padd<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
   3539   return F32ToBf16(padd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
   3540 }
   3541 
   3542 template<> EIGEN_STRONG_INLINE Packet4bf psub<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
   3543   return F32ToBf16(psub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
   3544 }
   3545 
   3546 template<> EIGEN_STRONG_INLINE Packet4bf pmul<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
   3547   return F32ToBf16(pmul<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
   3548 }
   3549 
   3550 template<> EIGEN_STRONG_INLINE Packet4bf pdiv<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
   3551   return F32ToBf16(pdiv<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
   3552 }
   3553 
   3554 template<>
   3555 EIGEN_STRONG_INLINE Packet4bf pgather<bfloat16, Packet4bf>(const bfloat16* from, Index stride)
   3556 {
   3557   return pgather<uint16_t, Packet4us>(reinterpret_cast<const uint16_t*>(from), stride);
   3558 }
   3559 
   3560 template<>
   3561 EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet4bf>(bfloat16* to, const Packet4bf& from, Index stride)
   3562 {
   3563   pscatter<uint16_t, Packet4us>(reinterpret_cast<uint16_t*>(to), from, stride);
   3564 }
   3565 
   3566 template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet4bf>(const Packet4bf& a)
   3567 {
   3568   return static_cast<bfloat16>(predux<Packet4f>(Bf16ToF32(a)));
   3569 }
   3570 
   3571 template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet4bf>(const Packet4bf& a)
   3572 {
   3573   return static_cast<bfloat16>(predux_max<Packet4f>(Bf16ToF32(a)));
   3574 }
   3575 
   3576 template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet4bf>(const Packet4bf& a)
   3577 {
   3578   return static_cast<bfloat16>(predux_min<Packet4f>(Bf16ToF32(a)));
   3579 }
   3580 
   3581 template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet4bf>(const Packet4bf& a)
   3582 {
   3583   return static_cast<bfloat16>(predux_mul<Packet4f>(Bf16ToF32(a)));
   3584 }
   3585 
   3586 template<> EIGEN_STRONG_INLINE Packet4bf preverse<Packet4bf>(const Packet4bf& a)
   3587 {
   3588   return preverse<Packet4us>(a);
   3589 }
   3590 
   3591 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4bf, 4>& kernel)
   3592 {
   3593   detail::ptranspose_impl(kernel);
   3594 }
   3595 
   3596 template<> EIGEN_STRONG_INLINE Packet4bf pabsdiff<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
   3597 {
   3598   return F32ToBf16(pabsdiff<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
   3599 }
   3600 
   3601 template<> EIGEN_STRONG_INLINE Packet4bf pcmp_eq<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
   3602 {
   3603   return F32MaskToBf16Mask(pcmp_eq<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
   3604 }
   3605 
   3606 template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
   3607 {
   3608   return F32MaskToBf16Mask(pcmp_lt<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
   3609 }
   3610 
   3611 template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
   3612 {
   3613   return F32MaskToBf16Mask(pcmp_lt_or_nan<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
   3614 }
   3615 
   3616 template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
   3617 {
   3618   return F32MaskToBf16Mask(pcmp_le<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
   3619 }
   3620 
   3621 template<> EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(const Packet4bf& a)
   3622 {
   3623   return pxor<Packet4us>(a, pset1<Packet4us>(static_cast<uint16_t>(0x8000)));
   3624 }
   3625 
   3626 //---------- double ----------
   3627 
// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrinsics for double.
// Confirmed at least with __apple_build_version__ = 6000054.
   3630 #ifdef __apple_build_version__
   3631 // Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
   3632 // https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with
   3633 // major toolchain updates.
   3634 #define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000)
   3635 #else
   3636 #define EIGEN_APPLE_DOUBLE_NEON_BUG 0
   3637 #endif
   3638 
   3639 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
   3640 
   3641 // Bug 907: workaround missing declarations of the following two functions in the ADK
   3642 // Defining these functions as templates ensures that if these intrinsics are
   3643 // already defined in arm_neon.h, then our workaround doesn't cause a conflict
   3644 // and has lower priority in overload resolution.
   3645 template <typename T> uint64x2_t vreinterpretq_u64_f64(T a) { return (uint64x2_t) a; }
   3646 
   3647 template <typename T> float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t) a; }
   3648 
   3649 typedef float64x2_t Packet2d;
   3650 typedef float64x1_t Packet1d;
   3651 
   3652 // fuctionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
   3653 // Currently used in LU/arch/InverseSize4.h to enable a shared implementation
   3654 // for fast inversion of matrices of size 4.
   3655 EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask)
   3656 {
   3657   const double* a = reinterpret_cast<const double*>(&m);
   3658   const double* b = reinterpret_cast<const double*>(&n);
   3659   Packet2d res = {*(a + (mask & 1)), *(b + ((mask >> 1) & 1))};
   3660   return res;
   3661 }
   3662 
   3663 EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask)
   3664 {
   3665   return shuffle(a, b, mask);
   3666 }
   3667 EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a,const Packet2d& b)
   3668 {
   3669   return shuffle(a, b, 0);
   3670 }
   3671 EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a,const Packet2d& b)
   3672 {
   3673   return shuffle(a, b, 3);
   3674 }
   3675 #define vec2d_duplane(a, p) \
   3676   vdupq_laneq_f64(a, p)
   3677 
   3678 template<> struct packet_traits<double>  : default_packet_traits
   3679 {
   3680   typedef Packet2d type;
   3681   typedef Packet2d half;
   3682   enum
   3683   {
   3684     Vectorizable = 1,
   3685     AlignedOnScalar = 1,
   3686     size = 2,
   3687     HasHalfPacket = 0,
   3688 
   3689     HasCmp       = 1,
   3690     HasAdd       = 1,
   3691     HasSub       = 1,
   3692     HasShift     = 1,
   3693     HasMul       = 1,
   3694     HasNegate    = 1,
   3695     HasAbs       = 1,
   3696     HasArg       = 0,
   3697     HasAbs2      = 1,
   3698     HasAbsDiff   = 1,
   3699     HasMin       = 1,
   3700     HasMax       = 1,
   3701     HasConj      = 1,
   3702     HasSetLinear = 0,
   3703     HasBlend     = 0,
   3704 
   3705     HasDiv   = 1,
   3706     HasFloor = 1,
   3707     HasCeil = 1,
   3708     HasRint = 1,
   3709 
   3710     HasSin  = 0,
   3711     HasCos  = 0,
   3712     HasLog  = 1,
   3713     HasExp  = 1,
   3714     HasSqrt = 1,
   3715     HasRsqrt = 1,
   3716     HasTanh = 0,
   3717     HasErf  = 0
   3718   };
   3719 };
   3720 
   3721 template<> struct unpacket_traits<Packet2d>
   3722 {
   3723   typedef double type;
   3724   typedef Packet2d half;
   3725   typedef Packet2l integer_packet;
   3726   enum
   3727   {
   3728     size = 2,
   3729     alignment = Aligned16,
   3730     vectorizable = true,
   3731     masked_load_available = false,
   3732     masked_store_available = false
   3733   };
   3734 };
   3735 
   3736 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) { return vdupq_n_f64(from); }
   3737 
   3738 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
   3739 {
   3740   const double c[] = {0.0,1.0};
   3741   return vaddq_f64(pset1<Packet2d>(a), vld1q_f64(c));
   3742 }
   3743 
   3744 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }
   3745 
   3746 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }
   3747 
   3748 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
   3749 template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b){
   3750   const Packet2d mask = {numext::bit_cast<double>(0x8000000000000000ull),0.0};
   3751   return padd(a, pxor(mask, b));
   3752 }
   3753 
   3754 template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); }
   3755 
   3756 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
   3757 
   3758 template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); }
   3759 
   3760 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
   3761 
   3762 #ifdef __ARM_FEATURE_FMA
   3763 // See bug 936. See above comment about FMA for float.
   3764 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
   3765 { return vfmaq_f64(c,a,b); }
   3766 #else
   3767 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
   3768 { return vmlaq_f64(c,a,b); }
   3769 #endif
   3770 
   3771 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }
   3772 
   3773 #ifdef __ARM_FEATURE_NUMERIC_MAXMIN
   3774 // numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
   3775 template<> EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) { return vminnmq_f64(a, b); }
   3776 template<> EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxnmq_f64(a, b); }
   3777 
   3778 #endif
   3779 
   3780 template<> EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) { return pmin<Packet2d>(a, b); }
   3781 
   3782 template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); }
   3783 
   3784 
   3785 template<> EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) { return pmax<Packet2d>(a, b); }
   3786 
   3787 // Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
   3788 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
   3789 { return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
   3790 
   3791 template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
   3792 { return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
   3793 
   3794 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
   3795 { return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
   3796 
   3797 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
   3798 { return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
   3799 
   3800 template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b)
   3801 { return vreinterpretq_f64_u64(vcleq_f64(a,b)); }
   3802 
   3803 template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b)
   3804 { return vreinterpretq_f64_u64(vcltq_f64(a,b)); }
   3805 
   3806 template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b)
   3807 { return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_u64(vcgeq_f64(a,b)))); }
   3808 
   3809 template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b)
   3810 { return vreinterpretq_f64_u64(vceqq_f64(a,b)); }
   3811 
   3812 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
   3813 { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
   3814 
   3815 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
   3816 { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
   3817 
   3818 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) { return vld1q_dup_f64(from); }
   3819 template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
   3820 { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to,from); }
   3821 
   3822 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
   3823 { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to,from); }
   3824 
   3825 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride)
   3826 {
   3827   Packet2d res = pset1<Packet2d>(0.0);
   3828   res = vld1q_lane_f64(from + 0*stride, res, 0);
   3829   res = vld1q_lane_f64(from + 1*stride, res, 1);
   3830   return res;
   3831 }
   3832 
   3833 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
   3834 {
   3835   vst1q_lane_f64(to + stride*0, from, 0);
   3836   vst1q_lane_f64(to + stride*1, from, 1);
   3837 }
   3838 
   3839 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); }
   3840 
   3841 // FIXME only store the 2 first elements ?
   3842 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a,0); }
   3843 
   3844 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
   3845 { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
   3846 
   3847 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
   3848 
   3849 #if EIGEN_COMP_CLANG && defined(__apple_build_version__)
   3850 // workaround ICE, see bug 907
   3851 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
   3852 { return (vget_low_f64(a) + vget_high_f64(a))[0]; }
   3853 #else
   3854 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
   3855 { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
   3856 #endif
   3857 
   3858 // Other reduction functions:
   3859 // mul
   3860 #if EIGEN_COMP_CLANG && defined(__apple_build_version__)
   3861 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
   3862 { return (vget_low_f64(a) * vget_high_f64(a))[0]; }
   3863 #else
   3864 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
   3865 { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
   3866 #endif
   3867 
   3868 // min
   3869 template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
   3870 { return vgetq_lane_f64(vpminq_f64(a,a), 0); }
   3871 
   3872 // max
   3873 template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
   3874 { return vgetq_lane_f64(vpmaxq_f64(a,a), 0); }
   3875 
   3876 
   3877 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
   3878 ptranspose(PacketBlock<Packet2d, 2>& kernel)
   3879 {
   3880   const float64x2_t tmp1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
   3881   const float64x2_t tmp2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
   3882 
   3883   kernel.packet[0] = tmp1;
   3884   kernel.packet[1] = tmp2;
   3885 }
   3886 
   3887 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect( const Packet2d& mask, const Packet2d& a, const Packet2d& b)
   3888 { return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); }
   3889 
   3890 template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a)
   3891 { return vrndnq_f64(a); }
   3892 
   3893 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
   3894 { return vrndmq_f64(a); }
   3895 
   3896 template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
   3897 { return vrndpq_f64(a); }
   3898 
   3899 template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent)
   3900 { return pldexp_generic(a, exponent); }
   3901 
   3902 template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent)
   3903 { return pfrexp_generic(a,exponent); }
   3904 
   3905 template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from)
   3906 { return vreinterpretq_f64_u64(vdupq_n_u64(from)); }
   3907 
   3908 template<> EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
   3909   // Compute approximate reciprocal sqrt.
   3910   Packet2d x = vrsqrteq_f64(a);
   3911   // Do Newton iterations for 1/sqrt(x).
   3912   x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x);
   3913   x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x);
   3914   x = vmulq_f64(vrsqrtsq_f64(vmulq_f64(a, x), x), x);
   3915   const Packet2d infinity = pset1<Packet2d>(NumTraits<double>::infinity());
   3916   return pselect(pcmp_eq(a, pzero(a)), infinity, x);
   3917 }
   3918 
   3919 template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrtq_f64(_x); }
   3920 
   3921 #endif // EIGEN_ARCH_ARM64
   3922 
// Do we have fp16 types and the supporting Neon intrinsics?
   3924 #if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
   3925 typedef float16x4_t Packet4hf;
   3926 typedef float16x8_t Packet8hf;
   3927 
   3928 template <>
   3929 struct packet_traits<Eigen::half> : default_packet_traits {
   3930   typedef Packet8hf type;
   3931   typedef Packet4hf half;
   3932   enum {
   3933     Vectorizable = 1,
   3934     AlignedOnScalar = 1,
   3935     size = 8,
   3936     HasHalfPacket = 1,
   3937 
   3938     HasCmp = 1,
   3939     HasCast = 1,
   3940     HasAdd = 1,
   3941     HasSub = 1,
   3942     HasShift = 1,
   3943     HasMul = 1,
   3944     HasNegate = 1,
   3945     HasAbs = 1,
   3946     HasArg = 0,
   3947     HasAbs2 = 1,
   3948     HasAbsDiff = 0,
   3949     HasMin = 1,
   3950     HasMax = 1,
   3951     HasConj = 1,
   3952     HasSetLinear = 0,
   3953     HasBlend = 0,
   3954     HasInsert = 1,
   3955     HasReduxp = 1,
   3956     HasDiv = 1,
   3957     HasFloor = 1,
   3958     HasCeil = 1,
   3959     HasRint = 1,
   3960     HasSin = 0,
   3961     HasCos = 0,
   3962     HasLog = 0,
   3963     HasExp = 0,
   3964     HasSqrt = 1,
   3965     HasRsqrt = 1,
   3966     HasErf = EIGEN_FAST_MATH,
   3967     HasBessel = 0,  // Issues with accuracy.
   3968     HasNdtri = 0
   3969   };
   3970 };
   3971 
   3972 template <>
   3973 struct unpacket_traits<Packet4hf> {
   3974   typedef Eigen::half type;
   3975   typedef Packet4hf half;
   3976   enum {
   3977     size = 4,
   3978     alignment = Aligned16,
   3979     vectorizable = true,
   3980     masked_load_available = false,
   3981     masked_store_available = false
   3982   };
   3983 };
   3984 
   3985 template <>
   3986 struct unpacket_traits<Packet8hf> {
   3987   typedef Eigen::half type;
   3988   typedef Packet4hf half;
   3989   enum {
   3990     size = 8,
   3991     alignment = Aligned16,
   3992     vectorizable = true,
   3993     masked_load_available = false,
   3994     masked_store_available = false
   3995   };
   3996 };
   3997 
   3998 template<>
   3999 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half_dowto4<Packet8hf>(const Packet8hf& a) {
   4000   return vadd_f16(vget_low_f16(a), vget_high_f16(a));
   4001 }
   4002 
   4003 template <>
   4004 EIGEN_STRONG_INLINE Packet8hf pset1<Packet8hf>(const Eigen::half& from) {
   4005   return vdupq_n_f16(from.x);
   4006 }
   4007 
   4008 template <>
   4009 EIGEN_STRONG_INLINE Packet4hf pset1<Packet4hf>(const Eigen::half& from) {
   4010   return vdup_n_f16(from.x);
   4011 }
   4012 
   4013 template <>
   4014 EIGEN_STRONG_INLINE Packet8hf plset<Packet8hf>(const Eigen::half& a) {
   4015   const float16_t f[] = {0, 1, 2, 3, 4, 5, 6, 7};
   4016   Packet8hf countdown = vld1q_f16(f);
   4017   return vaddq_f16(pset1<Packet8hf>(a), countdown);
   4018 }
   4019 
   4020 template <>
   4021 EIGEN_STRONG_INLINE Packet4hf plset<Packet4hf>(const Eigen::half& a) {
   4022   const float16_t f[] = {0, 1, 2, 3};
   4023   Packet4hf countdown = vld1_f16(f);
   4024   return vadd_f16(pset1<Packet4hf>(a), countdown);
   4025 }
   4026 
   4027 template <>
   4028 EIGEN_STRONG_INLINE Packet8hf padd<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
   4029   return vaddq_f16(a, b);
   4030 }
   4031 
   4032 template <>
   4033 EIGEN_STRONG_INLINE Packet4hf padd<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
   4034   return vadd_f16(a, b);
   4035 }
   4036 
   4037 template <>
   4038 EIGEN_STRONG_INLINE Packet8hf psub<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
   4039   return vsubq_f16(a, b);
   4040 }
   4041 
   4042 template <>
   4043 EIGEN_STRONG_INLINE Packet4hf psub<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
   4044   return vsub_f16(a, b);
   4045 }
   4046 
   4047 template <>
   4048 EIGEN_STRONG_INLINE Packet8hf pnegate(const Packet8hf& a) {
   4049   return vnegq_f16(a);
   4050 }
   4051 
   4052 template <>
   4053 EIGEN_STRONG_INLINE Packet4hf pnegate(const Packet4hf& a) {
   4054   return vneg_f16(a);
   4055 }
   4056 
   4057 template <>
   4058 EIGEN_STRONG_INLINE Packet8hf pconj(const Packet8hf& a) {
   4059   return a;
   4060 }
   4061 
   4062 template <>
   4063 EIGEN_STRONG_INLINE Packet4hf pconj(const Packet4hf& a) {
   4064   return a;
   4065 }
   4066 
   4067 template <>
   4068 EIGEN_STRONG_INLINE Packet8hf pmul<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
   4069   return vmulq_f16(a, b);
   4070 }
   4071 
   4072 template <>
   4073 EIGEN_STRONG_INLINE Packet4hf pmul<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
   4074   return vmul_f16(a, b);
   4075 }
   4076 
   4077 template <>
   4078 EIGEN_STRONG_INLINE Packet8hf pdiv<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
   4079   return vdivq_f16(a, b);
   4080 }
   4081 
   4082 template <>
   4083 EIGEN_STRONG_INLINE Packet4hf pdiv<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
   4084   return vdiv_f16(a, b);
   4085 }
   4086 
   4087 template <>
   4088 EIGEN_STRONG_INLINE Packet8hf pmadd(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
   4089   return vfmaq_f16(c, a, b);
   4090 }
   4091 
   4092 template <>
   4093 EIGEN_STRONG_INLINE Packet4hf pmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
   4094   return vfma_f16(c, a, b);
   4095 }
   4096 
   4097 template <>
   4098 EIGEN_STRONG_INLINE Packet8hf pmin<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
   4099   return vminq_f16(a, b);
   4100 }
   4101 
   4102 template <>
   4103 EIGEN_STRONG_INLINE Packet4hf pmin<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
   4104   return vmin_f16(a, b);
   4105 }
   4106 
   4107 #ifdef __ARM_FEATURE_NUMERIC_MAXMIN
   4108 // numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
   4109 template<> EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return vminnm_f16(a, b); }
   4110 template<> EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return vminnmq_f16(a, b); }
   4111 #endif
   4112 
   4113 template<> EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return pmin<Packet4hf>(a, b); }
   4114 
   4115 template<> EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return pmin<Packet8hf>(a, b); }
   4116 
   4117 template <>
   4118 EIGEN_STRONG_INLINE Packet8hf pmax<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
   4119   return vmaxq_f16(a, b);
   4120 }
   4121 
   4122 template <>
   4123 EIGEN_STRONG_INLINE Packet4hf pmax<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
   4124   return vmax_f16(a, b);
   4125 }
   4126 
   4127 #ifdef __ARM_FEATURE_NUMERIC_MAXMIN
   4128 // numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
   4129 template<> EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return vmaxnm_f16(a, b); }
   4130 template<> EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return vmaxnmq_f16(a, b); }
   4131 #endif
   4132 
   4133 template<> EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return pmax<Packet4hf>(a, b); }
   4134 
   4135 template<> EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return pmax<Packet8hf>(a, b); }
   4136 
   4137 #define EIGEN_MAKE_ARM_FP16_CMP_8(name)                                               \
   4138   template <>                                                                         \
   4139   EIGEN_STRONG_INLINE Packet8hf pcmp_##name(const Packet8hf& a, const Packet8hf& b) { \
   4140     return vreinterpretq_f16_u16(vc##name##q_f16(a, b));                              \
   4141   }
   4142 
   4143 #define EIGEN_MAKE_ARM_FP16_CMP_4(name)                                               \
   4144   template <>                                                                         \
   4145   EIGEN_STRONG_INLINE Packet4hf pcmp_##name(const Packet4hf& a, const Packet4hf& b) { \
   4146     return vreinterpret_f16_u16(vc##name##_f16(a, b));                                \
   4147   }
   4148 
   4149 EIGEN_MAKE_ARM_FP16_CMP_8(eq)
   4150 EIGEN_MAKE_ARM_FP16_CMP_8(lt)
   4151 EIGEN_MAKE_ARM_FP16_CMP_8(le)
   4152 
   4153 EIGEN_MAKE_ARM_FP16_CMP_4(eq)
   4154 EIGEN_MAKE_ARM_FP16_CMP_4(lt)
   4155 EIGEN_MAKE_ARM_FP16_CMP_4(le)
   4156 
   4157 #undef EIGEN_MAKE_ARM_FP16_CMP_8
   4158 #undef EIGEN_MAKE_ARM_FP16_CMP_4
   4159 
   4160 template <>
   4161 EIGEN_STRONG_INLINE Packet8hf pcmp_lt_or_nan<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
   4162   return vreinterpretq_f16_u16(vmvnq_u16(vcgeq_f16(a, b)));
   4163 }
   4164 
   4165 template <>
   4166 EIGEN_STRONG_INLINE Packet4hf pcmp_lt_or_nan<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
   4167   return vreinterpret_f16_u16(vmvn_u16(vcge_f16(a, b)));
   4168 }
   4169 
   4170 template <>
   4171 EIGEN_STRONG_INLINE Packet8hf print<Packet8hf>(const Packet8hf& a)
   4172 { return vrndnq_f16(a); }
   4173 
   4174 template <>
   4175 EIGEN_STRONG_INLINE Packet4hf print<Packet4hf>(const Packet4hf& a)
   4176 { return vrndn_f16(a); }
   4177 
   4178 template <>
   4179 EIGEN_STRONG_INLINE Packet8hf pfloor<Packet8hf>(const Packet8hf& a)
   4180 { return vrndmq_f16(a); }
   4181 
   4182 template <>
   4183 EIGEN_STRONG_INLINE Packet4hf pfloor<Packet4hf>(const Packet4hf& a)
   4184 { return vrndm_f16(a); }
   4185 
   4186 template <>
   4187 EIGEN_STRONG_INLINE Packet8hf pceil<Packet8hf>(const Packet8hf& a)
   4188 { return vrndpq_f16(a); }
   4189 
   4190 template <>
   4191 EIGEN_STRONG_INLINE Packet4hf pceil<Packet4hf>(const Packet4hf& a)
   4192 { return vrndp_f16(a); }
   4193 
   4194 template <>
   4195 EIGEN_STRONG_INLINE Packet8hf psqrt<Packet8hf>(const Packet8hf& a) {
   4196   return vsqrtq_f16(a);
   4197 }
   4198 
   4199 template <>
   4200 EIGEN_STRONG_INLINE Packet4hf psqrt<Packet4hf>(const Packet4hf& a) {
   4201   return vsqrt_f16(a);
   4202 }
   4203 
   4204 template <>
   4205 EIGEN_STRONG_INLINE Packet8hf pand<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
   4206   return vreinterpretq_f16_u16(vandq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
   4207 }
   4208 
   4209 template <>
   4210 EIGEN_STRONG_INLINE Packet4hf pand<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
   4211   return vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
   4212 }
   4213 
   4214 template <>
   4215 EIGEN_STRONG_INLINE Packet8hf por<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
   4216   return vreinterpretq_f16_u16(vorrq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
   4217 }
   4218 
   4219 template <>
   4220 EIGEN_STRONG_INLINE Packet4hf por<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
   4221   return vreinterpret_f16_u16(vorr_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
   4222 }
   4223 
   4224 template <>
   4225 EIGEN_STRONG_INLINE Packet8hf pxor<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
   4226   return vreinterpretq_f16_u16(veorq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
   4227 }
   4228 
   4229 template <>
   4230 EIGEN_STRONG_INLINE Packet4hf pxor<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
   4231   return vreinterpret_f16_u16(veor_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
   4232 }
   4233 
   4234 template <>
   4235 EIGEN_STRONG_INLINE Packet8hf pandnot<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
   4236   return vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
   4237 }
   4238 
   4239 template <>
   4240 EIGEN_STRONG_INLINE Packet4hf pandnot<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
   4241   return vreinterpret_f16_u16(vbic_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
   4242 }
   4243 
   4244 template <>
   4245 EIGEN_STRONG_INLINE Packet8hf pload<Packet8hf>(const Eigen::half* from) {
   4246   EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(reinterpret_cast<const float16_t*>(from));
   4247 }
   4248 
   4249 template <>
   4250 EIGEN_STRONG_INLINE Packet4hf pload<Packet4hf>(const Eigen::half* from) {
   4251   EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(reinterpret_cast<const float16_t*>(from));
   4252 }
   4253 
   4254 template <>
   4255 EIGEN_STRONG_INLINE Packet8hf ploadu<Packet8hf>(const Eigen::half* from) {
   4256   EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f16(reinterpret_cast<const float16_t*>(from));
   4257 }
   4258 
   4259 template <>
   4260 EIGEN_STRONG_INLINE Packet4hf ploadu<Packet4hf>(const Eigen::half* from) {
   4261   EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f16(reinterpret_cast<const float16_t*>(from));
   4262 }
   4263 
   4264 template <>
   4265 EIGEN_STRONG_INLINE Packet8hf ploaddup<Packet8hf>(const Eigen::half* from) {
   4266   Packet8hf packet;
   4267   packet[0] = from[0].x;
   4268   packet[1] = from[0].x;
   4269   packet[2] = from[1].x;
   4270   packet[3] = from[1].x;
   4271   packet[4] = from[2].x;
   4272   packet[5] = from[2].x;
   4273   packet[6] = from[3].x;
   4274   packet[7] = from[3].x;
   4275   return packet;
   4276 }
   4277 
   4278 template <>
   4279 EIGEN_STRONG_INLINE Packet4hf ploaddup<Packet4hf>(const Eigen::half* from) {
   4280   float16x4_t packet;
   4281   float16_t* tmp;
   4282   tmp = (float16_t*)&packet;
   4283   tmp[0] = from[0].x;
   4284   tmp[1] = from[0].x;
   4285   tmp[2] = from[1].x;
   4286   tmp[3] = from[1].x;
   4287   return packet;
   4288 }
   4289 
   4290 template <>
   4291 EIGEN_STRONG_INLINE Packet8hf ploadquad<Packet8hf>(const Eigen::half* from) {
   4292   Packet4hf lo, hi;
   4293   lo = vld1_dup_f16(reinterpret_cast<const float16_t*>(from));
   4294   hi = vld1_dup_f16(reinterpret_cast<const float16_t*>(from+1));
   4295   return vcombine_f16(lo, hi);
   4296 }
   4297 
   4298 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertfirst(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 0); }
   4299 
   4300 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertfirst(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 0); }
   4301 
   4302 template <>
   4303 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pselect(const Packet8hf& mask, const Packet8hf& a, const Packet8hf& b) {
   4304   return vbslq_f16(vreinterpretq_u16_f16(mask), a, b);
   4305 }
   4306 
   4307 template <>
   4308 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pselect(const Packet4hf& mask, const Packet4hf& a, const Packet4hf& b) {
   4309   return vbsl_f16(vreinterpret_u16_f16(mask), a, b);
   4310 }
   4311 
   4312 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertlast(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 7); }
   4313 
   4314 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 3); }
   4315 
   4316 template <>
   4317 EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
   4318   EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from);
   4319 }
   4320 
   4321 template <>
   4322 EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
   4323   EIGEN_DEBUG_ALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from);
   4324 }
   4325 
   4326 template <>
   4327 EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
   4328   EIGEN_DEBUG_UNALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from);
   4329 }
   4330 
   4331 template <>
   4332 EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
   4333   EIGEN_DEBUG_UNALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from);
   4334 }
   4335 
   4336 template <>
   4337 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pgather<Eigen::half, Packet8hf>(const Eigen::half* from, Index stride) {
   4338   Packet8hf res = pset1<Packet8hf>(Eigen::half(0.f));
   4339   res = vsetq_lane_f16(from[0 * stride].x, res, 0);
   4340   res = vsetq_lane_f16(from[1 * stride].x, res, 1);
   4341   res = vsetq_lane_f16(from[2 * stride].x, res, 2);
   4342   res = vsetq_lane_f16(from[3 * stride].x, res, 3);
   4343   res = vsetq_lane_f16(from[4 * stride].x, res, 4);
   4344   res = vsetq_lane_f16(from[5 * stride].x, res, 5);
   4345   res = vsetq_lane_f16(from[6 * stride].x, res, 6);
   4346   res = vsetq_lane_f16(from[7 * stride].x, res, 7);
   4347   return res;
   4348 }
   4349 
   4350 template <>
   4351 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pgather<Eigen::half, Packet4hf>(const Eigen::half* from, Index stride) {
   4352   Packet4hf res = pset1<Packet4hf>(Eigen::half(0.f));
   4353   res = vset_lane_f16(from[0 * stride].x, res, 0);
   4354   res = vset_lane_f16(from[1 * stride].x, res, 1);
   4355   res = vset_lane_f16(from[2 * stride].x, res, 2);
   4356   res = vset_lane_f16(from[3 * stride].x, res, 3);
   4357   return res;
   4358 }
   4359 
   4360 template <>
   4361 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8hf>(Eigen::half* to, const Packet8hf& from, Index stride) {
   4362   to[stride * 0].x = vgetq_lane_f16(from, 0);
   4363   to[stride * 1].x = vgetq_lane_f16(from, 1);
   4364   to[stride * 2].x = vgetq_lane_f16(from, 2);
   4365   to[stride * 3].x = vgetq_lane_f16(from, 3);
   4366   to[stride * 4].x = vgetq_lane_f16(from, 4);
   4367   to[stride * 5].x = vgetq_lane_f16(from, 5);
   4368   to[stride * 6].x = vgetq_lane_f16(from, 6);
   4369   to[stride * 7].x = vgetq_lane_f16(from, 7);
   4370 }
   4371 
   4372 template <>
   4373 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4hf>(Eigen::half* to, const Packet4hf& from, Index stride) {
   4374   to[stride * 0].x = vget_lane_f16(from, 0);
   4375   to[stride * 1].x = vget_lane_f16(from, 1);
   4376   to[stride * 2].x = vget_lane_f16(from, 2);
   4377   to[stride * 3].x = vget_lane_f16(from, 3);
   4378 }
   4379 
   4380 template <>
   4381 EIGEN_STRONG_INLINE void prefetch<Eigen::half>(const Eigen::half* addr) {
   4382   EIGEN_ARM_PREFETCH(addr);
   4383 }
   4384 
   4385 template <>
   4386 EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8hf>(const Packet8hf& a) {
   4387   float16_t x[8];
   4388   vst1q_f16(x, a);
   4389   Eigen::half h;
   4390   h.x = x[0];
   4391   return h;
   4392 }
   4393 
   4394 template <>
   4395 EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4hf>(const Packet4hf& a) {
   4396   float16_t x[4];
   4397   vst1_f16(x, a);
   4398   Eigen::half h;
   4399   h.x = x[0];
   4400   return h;
   4401 }
   4402 
   4403 template<> EIGEN_STRONG_INLINE Packet8hf preverse(const Packet8hf& a) {
   4404   float16x4_t a_lo, a_hi;
   4405   Packet8hf a_r64;
   4406 
   4407   a_r64 = vrev64q_f16(a);
   4408   a_lo = vget_low_f16(a_r64);
   4409   a_hi = vget_high_f16(a_r64);
   4410   return vcombine_f16(a_hi, a_lo);
   4411 }
   4412 
   4413 template <>
   4414 EIGEN_STRONG_INLINE Packet4hf preverse<Packet4hf>(const Packet4hf& a) {
   4415   return vrev64_f16(a);
   4416 }
   4417 
   4418 template <>
   4419 EIGEN_STRONG_INLINE Packet8hf pabs<Packet8hf>(const Packet8hf& a) {
   4420   return vabsq_f16(a);
   4421 }
   4422 
   4423 template <>
   4424 EIGEN_STRONG_INLINE Packet4hf pabs<Packet4hf>(const Packet4hf& a) {
   4425   return vabs_f16(a);
   4426 }
   4427 
   4428 template <>
   4429 EIGEN_STRONG_INLINE Eigen::half predux<Packet8hf>(const Packet8hf& a) {
   4430   float16x4_t a_lo, a_hi, sum;
   4431 
   4432   a_lo = vget_low_f16(a);
   4433   a_hi = vget_high_f16(a);
   4434   sum = vpadd_f16(a_lo, a_hi);
   4435   sum = vpadd_f16(sum, sum);
   4436   sum = vpadd_f16(sum, sum);
   4437 
   4438   Eigen::half h;
   4439   h.x = vget_lane_f16(sum, 0);
   4440   return h;
   4441 }
   4442 
   4443 template <>
   4444 EIGEN_STRONG_INLINE Eigen::half predux<Packet4hf>(const Packet4hf& a) {
   4445   float16x4_t sum;
   4446 
   4447   sum = vpadd_f16(a, a);
   4448   sum = vpadd_f16(sum, sum);
   4449   Eigen::half h;
   4450   h.x = vget_lane_f16(sum, 0);
   4451   return h;
   4452 }
   4453 
   4454 template <>
   4455 EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8hf>(const Packet8hf& a) {
   4456   float16x4_t a_lo, a_hi, prod;
   4457 
   4458   a_lo = vget_low_f16(a);
   4459   a_hi = vget_high_f16(a);
   4460   prod = vmul_f16(a_lo, a_hi);
   4461   prod = vmul_f16(prod, vrev64_f16(prod));
   4462 
   4463   Eigen::half h;
   4464   h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
   4465   return h;
   4466 }
   4467 
   4468 template <>
   4469 EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4hf>(const Packet4hf& a) {
   4470   float16x4_t prod;
   4471   prod = vmul_f16(a, vrev64_f16(a));
   4472   Eigen::half h;
   4473   h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
   4474   return h;
   4475 }
   4476 
   4477 template <>
   4478 EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8hf>(const Packet8hf& a) {
   4479   float16x4_t a_lo, a_hi, min;
   4480 
   4481   a_lo = vget_low_f16(a);
   4482   a_hi = vget_high_f16(a);
   4483   min = vpmin_f16(a_lo, a_hi);
   4484   min = vpmin_f16(min, min);
   4485   min = vpmin_f16(min, min);
   4486 
   4487   Eigen::half h;
   4488   h.x = vget_lane_f16(min, 0);
   4489   return h;
   4490 }
   4491 
   4492 template <>
   4493 EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4hf>(const Packet4hf& a) {
   4494   Packet4hf tmp;
   4495   tmp = vpmin_f16(a, a);
   4496   tmp = vpmin_f16(tmp, tmp);
   4497   Eigen::half h;
   4498   h.x = vget_lane_f16(tmp, 0);
   4499   return h;
   4500 }
   4501 
   4502 template <>
   4503 EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8hf>(const Packet8hf& a) {
   4504   float16x4_t a_lo, a_hi, max;
   4505 
   4506   a_lo = vget_low_f16(a);
   4507   a_hi = vget_high_f16(a);
   4508   max = vpmax_f16(a_lo, a_hi);
   4509   max = vpmax_f16(max, max);
   4510   max = vpmax_f16(max, max);
   4511 
   4512   Eigen::half h;
   4513   h.x = vget_lane_f16(max, 0);
   4514   return h;
   4515 }
   4516 
   4517 template <>
   4518 EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4hf>(const Packet4hf& a) {
   4519   Packet4hf tmp;
   4520   tmp = vpmax_f16(a, a);
   4521   tmp = vpmax_f16(tmp, tmp);
   4522   Eigen::half h;
   4523   h.x = vget_lane_f16(tmp, 0);
   4524   return h;
   4525 }
   4526 
   4527 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 4>& kernel)
   4528 {
   4529   const float16x8x2_t zip16_1 = vzipq_f16(kernel.packet[0], kernel.packet[1]);
   4530   const float16x8x2_t zip16_2 = vzipq_f16(kernel.packet[2], kernel.packet[3]);
   4531 
   4532   const float32x4x2_t zip32_1 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[0]), vreinterpretq_f32_f16(zip16_2.val[0]));
   4533   const float32x4x2_t zip32_2 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[1]), vreinterpretq_f32_f16(zip16_2.val[1]));
   4534 
   4535   kernel.packet[0] = vreinterpretq_f16_f32(zip32_1.val[0]);
   4536   kernel.packet[1] = vreinterpretq_f16_f32(zip32_1.val[1]);
   4537   kernel.packet[2] = vreinterpretq_f16_f32(zip32_2.val[0]);
   4538   kernel.packet[3] = vreinterpretq_f16_f32(zip32_2.val[1]);
   4539 }
   4540 
   4541 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4hf, 4>& kernel) {
   4542   EIGEN_ALIGN16 float16x4x4_t tmp_x4;
   4543   float16_t* tmp = (float16_t*)&kernel;
   4544   tmp_x4 = vld4_f16(tmp);
   4545 
   4546   kernel.packet[0] = tmp_x4.val[0];
   4547   kernel.packet[1] = tmp_x4.val[1];
   4548   kernel.packet[2] = tmp_x4.val[2];
   4549   kernel.packet[3] = tmp_x4.val[3];
   4550 }
   4551 
   4552 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 8>& kernel) {
   4553   float16x8x2_t T_1[4];
   4554 
   4555   T_1[0] = vuzpq_f16(kernel.packet[0], kernel.packet[1]);
   4556   T_1[1] = vuzpq_f16(kernel.packet[2], kernel.packet[3]);
   4557   T_1[2] = vuzpq_f16(kernel.packet[4], kernel.packet[5]);
   4558   T_1[3] = vuzpq_f16(kernel.packet[6], kernel.packet[7]);
   4559 
   4560   float16x8x2_t T_2[4];
   4561   T_2[0] = vuzpq_f16(T_1[0].val[0], T_1[1].val[0]);
   4562   T_2[1] = vuzpq_f16(T_1[0].val[1], T_1[1].val[1]);
   4563   T_2[2] = vuzpq_f16(T_1[2].val[0], T_1[3].val[0]);
   4564   T_2[3] = vuzpq_f16(T_1[2].val[1], T_1[3].val[1]);
   4565 
   4566   float16x8x2_t T_3[4];
   4567   T_3[0] = vuzpq_f16(T_2[0].val[0], T_2[2].val[0]);
   4568   T_3[1] = vuzpq_f16(T_2[0].val[1], T_2[2].val[1]);
   4569   T_3[2] = vuzpq_f16(T_2[1].val[0], T_2[3].val[0]);
   4570   T_3[3] = vuzpq_f16(T_2[1].val[1], T_2[3].val[1]);
   4571 
   4572   kernel.packet[0] = T_3[0].val[0];
   4573   kernel.packet[1] = T_3[2].val[0];
   4574   kernel.packet[2] = T_3[1].val[0];
   4575   kernel.packet[3] = T_3[3].val[0];
   4576   kernel.packet[4] = T_3[0].val[1];
   4577   kernel.packet[5] = T_3[2].val[1];
   4578   kernel.packet[6] = T_3[1].val[1];
   4579   kernel.packet[7] = T_3[3].val[1];
   4580 }
   4581 #endif // end EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
   4582 
   4583 } // end namespace internal
   4584 
   4585 } // end namespace Eigen
   4586 
   4587 #endif // EIGEN_PACKET_MATH_NEON_H