cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

PacketMath.h (33615B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2018 Wave Computing, Inc.
      5 // Written by:
      6 //   Chris Larsen
      7 //   Alexey Frunze (afrunze@wavecomp.com)
      8 //
      9 // This Source Code Form is subject to the terms of the Mozilla
     10 // Public License v. 2.0. If a copy of the MPL was not distributed
     11 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
     12 
     13 #ifndef EIGEN_PACKET_MATH_MSA_H
     14 #define EIGEN_PACKET_MATH_MSA_H
     15 
     16 #include <iostream>
     17 #include <string>
     18 
     19 namespace Eigen {
     20 
     21 namespace internal {
     22 
     // Architecture tuning knobs. Each one keeps any definition made earlier
     // (e.g. by the build) and only falls back to the value below otherwise.
     #if !defined(EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD)
     #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
     #endif

     // Value-less feature flag; only its presence is meaningful.
     #if !defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
     #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
     #endif

     #if !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
     #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
     #endif
     34 
     // Trace hook placed at the top of every MSA kernel.
     // Flip the "#if 0" to 1 to print file:line:function on std::cout the first
     // time each kernel runs; in the default configuration it expands to nothing.
     #if 0
     #define EIGEN_MSA_DEBUG                                                             \
       static bool firstTime = true;                                                     \
       do {                                                                              \
         if (firstTime) {                                                                \
           firstTime = false;                                                            \
           std::cout << __FILE__ << ':' << __LINE__ << ':' << __FUNCTION__ << std::endl; \
         }                                                                               \
       } while (0)
     #else
     #define EIGEN_MSA_DEBUG
     #endif
     47 
     // Packs four 2-bit lane selectors into one 8-bit shuffle immediate:
     // a occupies bits 1:0, b bits 3:2, c bits 5:4 and d bits 7:6.
     #define EIGEN_MSA_SHF_I8(a, b, c, d) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6))
     49 
     50 typedef v4f32 Packet4f;
     51 typedef v4i32 Packet4i;
     52 typedef v4u32 Packet4ui;
     53 
     // Helpers declaring a const packet named p4f_/p4i_/p4ui_<NAME> whose four
     // lanes all hold X.
     #define _EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = { X, X, X, X }
     #define _EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = { X, X, X, X }
     #define _EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = { X, X, X, X }
     57 
     58 inline std::ostream& operator<<(std::ostream& os, const Packet4f& value) {
     59   os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
     60   return os;
     61 }
     62 
     63 inline std::ostream& operator<<(std::ostream& os, const Packet4i& value) {
     64   os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
     65   return os;
     66 }
     67 
     68 inline std::ostream& operator<<(std::ostream& os, const Packet4ui& value) {
     69   os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
     70   return os;
     71 }
     72 
     73 template <>
     74 struct packet_traits<float> : default_packet_traits {
     75   typedef Packet4f type;
     76   typedef Packet4f half;  // Packet2f intrinsics not implemented yet
     77   enum {
     78     Vectorizable = 1,
     79     AlignedOnScalar = 1,
     80     size = 4,
     81     HasHalfPacket = 0,  // Packet2f intrinsics not implemented yet
     82     // FIXME check the Has*
     83     HasDiv = 1,
     84     HasSin = EIGEN_FAST_MATH,
     85     HasCos = EIGEN_FAST_MATH,
     86     HasTanh = EIGEN_FAST_MATH,
     87     HasErf = EIGEN_FAST_MATH,
     88     HasLog = 1,
     89     HasExp = 1,
     90     HasSqrt = 1,
     91     HasRsqrt = 1,
     92     HasRound = 1,
     93     HasFloor = 1,
     94     HasCeil = 1,
     95     HasBlend = 1
     96   };
     97 };
     98 
     99 template <>
    100 struct packet_traits<int32_t> : default_packet_traits {
    101   typedef Packet4i type;
    102   typedef Packet4i half;  // Packet2i intrinsics not implemented yet
    103   enum {
    104     Vectorizable = 1,
    105     AlignedOnScalar = 1,
    106     size = 4,
    107     HasHalfPacket = 0,  // Packet2i intrinsics not implemented yet
    108     // FIXME check the Has*
    109     HasDiv = 1,
    110     HasBlend = 1
    111   };
    112 };
    113 
    114 template <>
    115 struct unpacket_traits<Packet4f> {
    116   typedef float type;
    117   enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
    118   typedef Packet4f half;
    119 };
    120 
    121 template <>
    122 struct unpacket_traits<Packet4i> {
    123   typedef int32_t type;
    124   enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
    125   typedef Packet4i half;
    126 };
    127 
    128 template <>
    129 EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
    130   EIGEN_MSA_DEBUG;
    131 
    132   Packet4f v = { from, from, from, from };
    133   return v;
    134 }
    135 
    136 template <>
    137 EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) {
    138   EIGEN_MSA_DEBUG;
    139 
    140   return __builtin_msa_fill_w(from);
    141 }
    142 
    143 template <>
    144 EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float* from) {
    145   EIGEN_MSA_DEBUG;
    146 
    147   float f = *from;
    148   Packet4f v = { f, f, f, f };
    149   return v;
    150 }
    151 
    152 template <>
    153 EIGEN_STRONG_INLINE Packet4i pload1<Packet4i>(const int32_t* from) {
    154   EIGEN_MSA_DEBUG;
    155 
    156   return __builtin_msa_fill_w(*from);
    157 }
    158 
    159 template <>
    160 EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
    161   EIGEN_MSA_DEBUG;
    162 
    163   return __builtin_msa_fadd_w(a, b);
    164 }
    165 
    166 template <>
    167 EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
    168   EIGEN_MSA_DEBUG;
    169 
    170   return __builtin_msa_addv_w(a, b);
    171 }
    172 
    173 template <>
    174 EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
    175   EIGEN_MSA_DEBUG;
    176 
    177   static const Packet4f countdown = { 0.0f, 1.0f, 2.0f, 3.0f };
    178   return padd(pset1<Packet4f>(a), countdown);
    179 }
    180 
    181 template <>
    182 EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
    183   EIGEN_MSA_DEBUG;
    184 
    185   static const Packet4i countdown = { 0, 1, 2, 3 };
    186   return padd(pset1<Packet4i>(a), countdown);
    187 }
    188 
    189 template <>
    190 EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
    191   EIGEN_MSA_DEBUG;
    192 
    193   return __builtin_msa_fsub_w(a, b);
    194 }
    195 
    196 template <>
    197 EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
    198   EIGEN_MSA_DEBUG;
    199 
    200   return __builtin_msa_subv_w(a, b);
    201 }
    202 
    203 template <>
    204 EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
    205   EIGEN_MSA_DEBUG;
    206 
    207   return (Packet4f)__builtin_msa_bnegi_w((v4u32)a, 31);
    208 }
    209 
    210 template <>
    211 EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
    212   EIGEN_MSA_DEBUG;
    213 
    214   return __builtin_msa_addvi_w((v4i32)__builtin_msa_nori_b((v16u8)a, 0), 1);
    215 }
    216 
    217 template <>
    218 EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
    219   EIGEN_MSA_DEBUG;
    220 
    221   return a;
    222 }
    223 
    224 template <>
    225 EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
    226   EIGEN_MSA_DEBUG;
    227 
    228   return a;
    229 }
    230 
    231 template <>
    232 EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
    233   EIGEN_MSA_DEBUG;
    234 
    235   return __builtin_msa_fmul_w(a, b);
    236 }
    237 
    238 template <>
    239 EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
    240   EIGEN_MSA_DEBUG;
    241 
    242   return __builtin_msa_mulv_w(a, b);
    243 }
    244 
    245 template <>
    246 EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
    247   EIGEN_MSA_DEBUG;
    248 
    249   return __builtin_msa_fdiv_w(a, b);
    250 }
    251 
    252 template <>
    253 EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
    254   EIGEN_MSA_DEBUG;
    255 
    256   return __builtin_msa_div_s_w(a, b);
    257 }
    258 
    259 template <>
    260 EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
    261   EIGEN_MSA_DEBUG;
    262 
    263   return __builtin_msa_fmadd_w(c, a, b);
    264 }
    265 
    266 template <>
    267 EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
    268   EIGEN_MSA_DEBUG;
    269 
    270   // Use "asm" construct to avoid __builtin_msa_maddv_w GNU C bug.
    271   Packet4i value = c;
    272   __asm__("maddv.w %w[value], %w[a], %w[b]\n"
    273           // Outputs
    274           : [value] "+f"(value)
    275           // Inputs
    276           : [a] "f"(a), [b] "f"(b));
    277   return value;
    278 }
    279 
    280 template <>
    281 EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
    282   EIGEN_MSA_DEBUG;
    283 
    284   return (Packet4f)__builtin_msa_and_v((v16u8)a, (v16u8)b);
    285 }
    286 
    287 template <>
    288 EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
    289   EIGEN_MSA_DEBUG;
    290 
    291   return (Packet4i)__builtin_msa_and_v((v16u8)a, (v16u8)b);
    292 }
    293 
    294 template <>
    295 EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
    296   EIGEN_MSA_DEBUG;
    297 
    298   return (Packet4f)__builtin_msa_or_v((v16u8)a, (v16u8)b);
    299 }
    300 
    301 template <>
    302 EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
    303   EIGEN_MSA_DEBUG;
    304 
    305   return (Packet4i)__builtin_msa_or_v((v16u8)a, (v16u8)b);
    306 }
    307 
    308 template <>
    309 EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
    310   EIGEN_MSA_DEBUG;
    311 
    312   return (Packet4f)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
    313 }
    314 
    315 template <>
    316 EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
    317   EIGEN_MSA_DEBUG;
    318 
    319   return (Packet4i)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
    320 }
    321 
    322 template <>
    323 EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
    324   EIGEN_MSA_DEBUG;
    325 
    326   return pand(a, (Packet4f)__builtin_msa_xori_b((v16u8)b, 255));
    327 }
    328 
    329 template <>
    330 EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
    331   EIGEN_MSA_DEBUG;
    332 
    333   return pand(a, (Packet4i)__builtin_msa_xori_b((v16u8)b, 255));
    334 }
    335 
    336 template <>
    337 EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
    338   EIGEN_MSA_DEBUG;
    339 
    340 #if EIGEN_FAST_MATH
    341   // This prefers numbers to NaNs.
    342   return __builtin_msa_fmin_w(a, b);
    343 #else
    344   // This prefers NaNs to numbers.
    345   Packet4i aNaN = __builtin_msa_fcun_w(a, a);
    346   Packet4i aMinOrNaN = por(__builtin_msa_fclt_w(a, b), aNaN);
    347   return (Packet4f)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
    348 #endif
    349 }
    350 
    351 template <>
    352 EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
    353   EIGEN_MSA_DEBUG;
    354 
    355   return __builtin_msa_min_s_w(a, b);
    356 }
    357 
    358 template <>
    359 EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
    360   EIGEN_MSA_DEBUG;
    361 
    362 #if EIGEN_FAST_MATH
    363   // This prefers numbers to NaNs.
    364   return __builtin_msa_fmax_w(a, b);
    365 #else
    366   // This prefers NaNs to numbers.
    367   Packet4i aNaN = __builtin_msa_fcun_w(a, a);
    368   Packet4i aMaxOrNaN = por(__builtin_msa_fclt_w(b, a), aNaN);
    369   return (Packet4f)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
    370 #endif
    371 }
    372 
    373 template <>
    374 EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
    375   EIGEN_MSA_DEBUG;
    376 
    377   return __builtin_msa_max_s_w(a, b);
    378 }
    379 
    380 template <>
    381 EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
    382   EIGEN_MSA_DEBUG;
    383 
    384   EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
    385 }
    386 
    387 template <>
    388 EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
    389   EIGEN_MSA_DEBUG;
    390 
    391   EIGEN_DEBUG_ALIGNED_LOAD return __builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
    392 }
    393 
    394 template <>
    395 EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
    396   EIGEN_MSA_DEBUG;
    397 
    398   EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
    399 }
    400 
    401 template <>
    402 EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) {
    403   EIGEN_MSA_DEBUG;
    404 
    405   EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4i)__builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
    406 }
    407 
    408 template <>
    409 EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
    410   EIGEN_MSA_DEBUG;
    411 
    412   float f0 = from[0], f1 = from[1];
    413   Packet4f v0 = { f0, f0, f0, f0 };
    414   Packet4f v1 = { f1, f1, f1, f1 };
    415   return (Packet4f)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
    416 }
    417 
    418 template <>
    419 EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) {
    420   EIGEN_MSA_DEBUG;
    421 
    422   int32_t i0 = from[0], i1 = from[1];
    423   Packet4i v0 = { i0, i0, i0, i0 };
    424   Packet4i v1 = { i1, i1, i1, i1 };
    425   return (Packet4i)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
    426 }
    427 
    428 template <>
    429 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
    430   EIGEN_MSA_DEBUG;
    431 
    432   EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
    433 }
    434 
    435 template <>
    436 EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
    437   EIGEN_MSA_DEBUG;
    438 
    439   EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w(from, to, 0);
    440 }
    441 
    442 template <>
    443 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
    444   EIGEN_MSA_DEBUG;
    445 
    446   EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
    447 }
    448 
    449 template <>
    450 EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
    451   EIGEN_MSA_DEBUG;
    452 
    453   EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w(from, to, 0);
    454 }
    455 
    456 template <>
    457 EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
    458   EIGEN_MSA_DEBUG;
    459 
    460   float f = *from;
    461   Packet4f v = { f, f, f, f };
    462   v[1] = from[stride];
    463   v[2] = from[2 * stride];
    464   v[3] = from[3 * stride];
    465   return v;
    466 }
    467 
    468 template <>
    469 EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) {
    470   EIGEN_MSA_DEBUG;
    471 
    472   int32_t i = *from;
    473   Packet4i v = { i, i, i, i };
    474   v[1] = from[stride];
    475   v[2] = from[2 * stride];
    476   v[3] = from[3 * stride];
    477   return v;
    478 }
    479 
    480 template <>
    481 EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from,
    482                                                         Index stride) {
    483   EIGEN_MSA_DEBUG;
    484 
    485   *to = from[0];
    486   to += stride;
    487   *to = from[1];
    488   to += stride;
    489   *to = from[2];
    490   to += stride;
    491   *to = from[3];
    492 }
    493 
    494 template <>
    495 EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from,
    496                                                           Index stride) {
    497   EIGEN_MSA_DEBUG;
    498 
    499   *to = from[0];
    500   to += stride;
    501   *to = from[1];
    502   to += stride;
    503   *to = from[2];
    504   to += stride;
    505   *to = from[3];
    506 }
    507 
    508 template <>
    509 EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
    510   EIGEN_MSA_DEBUG;
    511 
    512   __builtin_prefetch(addr);
    513 }
    514 
    515 template <>
    516 EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
    517   EIGEN_MSA_DEBUG;
    518 
    519   __builtin_prefetch(addr);
    520 }
    521 
    522 template <>
    523 EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
    524   EIGEN_MSA_DEBUG;
    525 
    526   return a[0];
    527 }
    528 
    529 template <>
    530 EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) {
    531   EIGEN_MSA_DEBUG;
    532 
    533   return a[0];
    534 }
    535 
    536 template <>
    537 EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
    538   EIGEN_MSA_DEBUG;
    539 
    540   return (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
    541 }
    542 
    543 template <>
    544 EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
    545   EIGEN_MSA_DEBUG;
    546 
    547   return __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
    548 }
    549 
    550 template <>
    551 EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
    552   EIGEN_MSA_DEBUG;
    553 
    554   return (Packet4f)__builtin_msa_bclri_w((v4u32)a, 31);
    555 }
    556 
    557 template <>
    558 EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
    559   EIGEN_MSA_DEBUG;
    560 
    561   Packet4i zero = __builtin_msa_ldi_w(0);
    562   return __builtin_msa_add_a_w(zero, a);
    563 }
    564 
    565 template <>
    566 EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
    567   EIGEN_MSA_DEBUG;
    568 
    569   Packet4f s = padd(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
    570   s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
    571   return s[0];
    572 }
    573 
    574 
    575 template <>
    576 EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
    577   EIGEN_MSA_DEBUG;
    578 
    579   Packet4i s = padd(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
    580   s = padd(s, __builtin_msa_shf_w(s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
    581   return s[0];
    582 }
    583 
    584 // Other reduction functions:
    585 // mul
    586 template <>
    587 EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
    588   EIGEN_MSA_DEBUG;
    589 
    590   Packet4f p = pmul(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
    591   p = pmul(p, (Packet4f)__builtin_msa_shf_w((v4i32)p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
    592   return p[0];
    593 }
    594 
    595 template <>
    596 EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
    597   EIGEN_MSA_DEBUG;
    598 
    599   Packet4i p = pmul(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
    600   p = pmul(p, __builtin_msa_shf_w(p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
    601   return p[0];
    602 }
    603 
    604 // min
    605 template <>
    606 EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
    607   EIGEN_MSA_DEBUG;
    608 
    609   // Swap 64-bit halves of a.
    610   Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
    611 #if !EIGEN_FAST_MATH
    612   // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit
    613   // masks of all zeroes/ones in low 64 bits.
    614   v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
    615   // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes.
    616   unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
    617 #endif
    618   // Continue with min computation.
    619   Packet4f v = __builtin_msa_fmin_w(a, swapped);
    620   v = __builtin_msa_fmin_w(
    621       v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
    622 #if !EIGEN_FAST_MATH
    623   // Based on the mask select between v and 4 qNaNs.
    624   v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
    625   v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
    626 #endif
    627   return v[0];
    628 }
    629 
    630 template <>
    631 EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
    632   EIGEN_MSA_DEBUG;
    633 
    634   Packet4i m = pmin(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
    635   m = pmin(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
    636   return m[0];
    637 }
    638 
    639 // max
    640 template <>
    641 EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
    642   EIGEN_MSA_DEBUG;
    643 
    644   // Swap 64-bit halves of a.
    645   Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
    646 #if !EIGEN_FAST_MATH
    647   // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit
    648   // masks of all zeroes/ones in low 64 bits.
    649   v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
    650   // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes.
    651   unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
    652 #endif
    653   // Continue with max computation.
    654   Packet4f v = __builtin_msa_fmax_w(a, swapped);
    655   v = __builtin_msa_fmax_w(
    656       v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
    657 #if !EIGEN_FAST_MATH
    658   // Based on the mask select between v and 4 qNaNs.
    659   v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
    660   v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
    661 #endif
    662   return v[0];
    663 }
    664 
    665 template <>
    666 EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
    667   EIGEN_MSA_DEBUG;
    668 
    669   Packet4i m = pmax(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
    670   m = pmax(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
    671   return m[0];
    672 }
    673 
    674 inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4f, 4>& value) {
    675   os << "[ " << value.packet[0] << "," << std::endl
    676      << "  " << value.packet[1] << "," << std::endl
    677      << "  " << value.packet[2] << "," << std::endl
    678      << "  " << value.packet[3] << " ]";
    679   return os;
    680 }
    681 
    682 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
    683   EIGEN_MSA_DEBUG;
    684 
    685   v4i32 tmp1, tmp2, tmp3, tmp4;
    686 
    687   tmp1 = __builtin_msa_ilvr_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
    688   tmp2 = __builtin_msa_ilvr_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
    689   tmp3 = __builtin_msa_ilvl_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
    690   tmp4 = __builtin_msa_ilvl_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
    691 
    692   kernel.packet[0] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
    693   kernel.packet[1] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
    694   kernel.packet[2] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
    695   kernel.packet[3] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
    696 }
    697 
    698 inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4i, 4>& value) {
    699   os << "[ " << value.packet[0] << "," << std::endl
    700      << "  " << value.packet[1] << "," << std::endl
    701      << "  " << value.packet[2] << "," << std::endl
    702      << "  " << value.packet[3] << " ]";
    703   return os;
    704 }
    705 
    706 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
    707   EIGEN_MSA_DEBUG;
    708 
    709   v4i32 tmp1, tmp2, tmp3, tmp4;
    710 
    711   tmp1 = __builtin_msa_ilvr_w(kernel.packet[1], kernel.packet[0]);
    712   tmp2 = __builtin_msa_ilvr_w(kernel.packet[3], kernel.packet[2]);
    713   tmp3 = __builtin_msa_ilvl_w(kernel.packet[1], kernel.packet[0]);
    714   tmp4 = __builtin_msa_ilvl_w(kernel.packet[3], kernel.packet[2]);
    715 
    716   kernel.packet[0] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
    717   kernel.packet[1] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
    718   kernel.packet[2] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
    719   kernel.packet[3] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
    720 }
    721 
    722 template <>
    723 EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
    724   EIGEN_MSA_DEBUG;
    725 
    726   return __builtin_msa_fsqrt_w(a);
    727 }
    728 
    729 template <>
    730 EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
    731   EIGEN_MSA_DEBUG;
    732 
    733 #if EIGEN_FAST_MATH
    734   return __builtin_msa_frsqrt_w(a);
    735 #else
    736   Packet4f ones = __builtin_msa_ffint_s_w(__builtin_msa_ldi_w(1));
    737   return pdiv(ones, psqrt(a));
    738 #endif
    739 }
    740 
    741 template <>
    742 EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
    743   Packet4f v = a;
    744   int32_t old_mode, new_mode;
    745   asm volatile(
    746       "cfcmsa  %[old_mode], $1\n"
    747       "ori     %[new_mode], %[old_mode], 3\n"  // 3 = round towards -INFINITY.
    748       "ctcmsa  $1, %[new_mode]\n"
    749       "frint.w %w[v], %w[v]\n"
    750       "ctcmsa  $1, %[old_mode]\n"
    751       :  // outputs
    752       [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
    753       [v] "+f"(v)
    754       :  // inputs
    755       :  // clobbers
    756   );
    757   return v;
    758 }
    759 
    760 template <>
    761 EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
    762   Packet4f v = a;
    763   int32_t old_mode, new_mode;
    764   asm volatile(
    765       "cfcmsa  %[old_mode], $1\n"
    766       "ori     %[new_mode], %[old_mode], 3\n"
    767       "xori    %[new_mode], %[new_mode], 1\n"  // 2 = round towards +INFINITY.
    768       "ctcmsa  $1, %[new_mode]\n"
    769       "frint.w %w[v], %w[v]\n"
    770       "ctcmsa  $1, %[old_mode]\n"
    771       :  // outputs
    772       [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
    773       [v] "+f"(v)
    774       :  // inputs
    775       :  // clobbers
    776   );
    777   return v;
    778 }
    779 
    780 template <>
    781 EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
    782   Packet4f v = a;
    783   int32_t old_mode, new_mode;
    784   asm volatile(
    785       "cfcmsa  %[old_mode], $1\n"
    786       "ori     %[new_mode], %[old_mode], 3\n"
    787       "xori    %[new_mode], %[new_mode], 3\n"  // 0 = round to nearest, ties to even.
    788       "ctcmsa  $1, %[new_mode]\n"
    789       "frint.w %w[v], %w[v]\n"
    790       "ctcmsa  $1, %[old_mode]\n"
    791       :  // outputs
    792       [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
    793       [v] "+f"(v)
    794       :  // inputs
    795       :  // clobbers
    796   );
    797   return v;
    798 }
    799 
    800 template <>
    801 EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
    802                                     const Packet4f& elsePacket) {
    803   Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
    804                        ifPacket.select[3] };
    805   Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
    806   return (Packet4f)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
    807 }
    808 
    809 template <>
    810 EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
    811                                     const Packet4i& elsePacket) {
    812   Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
    813                        ifPacket.select[3] };
    814   Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
    815   return (Packet4i)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
    816 }
    817 
    818 //---------- double ----------
    819 
    820 typedef v2f64 Packet2d;
    821 typedef v2i64 Packet2l;
    822 typedef v2u64 Packet2ul;
    823 
    // Helpers declaring a const packet named p2d_/p2l_/p2ul_<NAME> whose two
    // lanes both hold X.
    #define _EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = { X, X }
    #define _EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = { X, X }
    #define _EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = { X, X }
    827 
    828 inline std::ostream& operator<<(std::ostream& os, const Packet2d& value) {
    829   os << "[ " << value[0] << ", " << value[1] << " ]";
    830   return os;
    831 }
    832 
    833 inline std::ostream& operator<<(std::ostream& os, const Packet2l& value) {
    834   os << "[ " << value[0] << ", " << value[1] << " ]";
    835   return os;
    836 }
    837 
    838 inline std::ostream& operator<<(std::ostream& os, const Packet2ul& value) {
    839   os << "[ " << value[0] << ", " << value[1] << " ]";
    840   return os;
    841 }
    842 
    843 template <>
    844 struct packet_traits<double> : default_packet_traits {
    845   typedef Packet2d type;
    846   typedef Packet2d half;
    847   enum {
    848     Vectorizable = 1,
    849     AlignedOnScalar = 1,
    850     size = 2,
    851     HasHalfPacket = 0,
    852     // FIXME check the Has*
    853     HasDiv = 1,
    854     HasExp = 1,
    855     HasSqrt = 1,
    856     HasRsqrt = 1,
    857     HasRound = 1,
    858     HasFloor = 1,
    859     HasCeil = 1,
    860     HasBlend = 1
    861   };
    862 };
    863 
    864 template <>
    865 struct unpacket_traits<Packet2d> {
    866   typedef double type;
    867   enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
    868   typedef Packet2d half;
    869 };
    870 
    871 template <>
    872 EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
    873   EIGEN_MSA_DEBUG;
    874 
    875   Packet2d value = { from, from };
    876   return value;
    877 }
    878 
    879 template <>
    880 EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
    881   EIGEN_MSA_DEBUG;
    882 
    883   return __builtin_msa_fadd_d(a, b);
    884 }
    885 
    886 template <>
    887 EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
    888   EIGEN_MSA_DEBUG;
    889 
    890   static const Packet2d countdown = { 0.0, 1.0 };
    891   return padd(pset1<Packet2d>(a), countdown);
    892 }
    893 
    894 template <>
    895 EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
    896   EIGEN_MSA_DEBUG;
    897 
    898   return __builtin_msa_fsub_d(a, b);
    899 }
    900 
    901 template <>
    902 EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
    903   EIGEN_MSA_DEBUG;
    904 
    905   return (Packet2d)__builtin_msa_bnegi_d((v2u64)a, 63);
    906 }
    907 
    908 template <>
    909 EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
    910   EIGEN_MSA_DEBUG;
    911 
    912   return a;
    913 }
    914 
    915 template <>
    916 EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
    917   EIGEN_MSA_DEBUG;
    918 
    919   return __builtin_msa_fmul_d(a, b);
    920 }
    921 
    922 template <>
    923 EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
    924   EIGEN_MSA_DEBUG;
    925 
    926   return __builtin_msa_fdiv_d(a, b);
    927 }
    928 
    929 template <>
    930 EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
    931   EIGEN_MSA_DEBUG;
    932 
    933   return __builtin_msa_fmadd_d(c, a, b);
    934 }
    935 
    936 // Logical Operations are not supported for float, so we have to reinterpret casts using MSA
    937 // intrinsics
    938 template <>
    939 EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
    940   EIGEN_MSA_DEBUG;
    941 
    942   return (Packet2d)__builtin_msa_and_v((v16u8)a, (v16u8)b);
    943 }
    944 
    945 template <>
    946 EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
    947   EIGEN_MSA_DEBUG;
    948 
    949   return (Packet2d)__builtin_msa_or_v((v16u8)a, (v16u8)b);
    950 }
    951 
    952 template <>
    953 EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
    954   EIGEN_MSA_DEBUG;
    955 
    956   return (Packet2d)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
    957 }
    958 
    959 template <>
    960 EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
    961   EIGEN_MSA_DEBUG;
    962 
    963   return pand(a, (Packet2d)__builtin_msa_xori_b((v16u8)b, 255));
    964 }
    965 
    966 template <>
    967 EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
    968   EIGEN_MSA_DEBUG;
    969 
    970   EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
    971 }
    972 
    973 template <>
    974 EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
    975   EIGEN_MSA_DEBUG;
    976 
    977 #if EIGEN_FAST_MATH
    978   // This prefers numbers to NaNs.
    979   return __builtin_msa_fmin_d(a, b);
    980 #else
    981   // This prefers NaNs to numbers.
    982   v2i64 aNaN = __builtin_msa_fcun_d(a, a);
    983   v2i64 aMinOrNaN = por(__builtin_msa_fclt_d(a, b), aNaN);
    984   return (Packet2d)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
    985 #endif
    986 }
    987 
    988 template <>
    989 EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
    990   EIGEN_MSA_DEBUG;
    991 
    992 #if EIGEN_FAST_MATH
    993   // This prefers numbers to NaNs.
    994   return __builtin_msa_fmax_d(a, b);
    995 #else
    996   // This prefers NaNs to numbers.
    997   v2i64 aNaN = __builtin_msa_fcun_d(a, a);
    998   v2i64 aMaxOrNaN = por(__builtin_msa_fclt_d(b, a), aNaN);
    999   return (Packet2d)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
   1000 #endif
   1001 }
   1002 
   1003 template <>
   1004 EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
   1005   EIGEN_MSA_DEBUG;
   1006 
   1007   EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
   1008 }
   1009 
   1010 template <>
   1011 EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
   1012   EIGEN_MSA_DEBUG;
   1013 
   1014   Packet2d value = { *from, *from };
   1015   return value;
   1016 }
   1017 
   1018 template <>
   1019 EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
   1020   EIGEN_MSA_DEBUG;
   1021 
   1022   EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
   1023 }
   1024 
   1025 template <>
   1026 EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
   1027   EIGEN_MSA_DEBUG;
   1028 
   1029   EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
   1030 }
   1031 
   1032 template <>
   1033 EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
   1034   EIGEN_MSA_DEBUG;
   1035 
   1036   Packet2d value;
   1037   value[0] = *from;
   1038   from += stride;
   1039   value[1] = *from;
   1040   return value;
   1041 }
   1042 
   1043 template <>
   1044 EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from,
   1045                                                          Index stride) {
   1046   EIGEN_MSA_DEBUG;
   1047 
   1048   *to = from[0];
   1049   to += stride;
   1050   *to = from[1];
   1051 }
   1052 
   1053 template <>
   1054 EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
   1055   EIGEN_MSA_DEBUG;
   1056 
   1057   __builtin_prefetch(addr);
   1058 }
   1059 
   1060 template <>
   1061 EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
   1062   EIGEN_MSA_DEBUG;
   1063 
   1064   return a[0];
   1065 }
   1066 
   1067 template <>
   1068 EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
   1069   EIGEN_MSA_DEBUG;
   1070 
   1071   return (Packet2d)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
   1072 }
   1073 
   1074 template <>
   1075 EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
   1076   EIGEN_MSA_DEBUG;
   1077 
   1078   return (Packet2d)__builtin_msa_bclri_d((v2u64)a, 63);
   1079 }
   1080 
   1081 template <>
   1082 EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
   1083   EIGEN_MSA_DEBUG;
   1084 
   1085   Packet2d s = padd(a, preverse(a));
   1086   return s[0];
   1087 }
   1088 
   1089 // Other reduction functions:
   1090 // mul
   1091 template <>
   1092 EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
   1093   EIGEN_MSA_DEBUG;
   1094 
   1095   Packet2d p = pmul(a, preverse(a));
   1096   return p[0];
   1097 }
   1098 
   1099 // min
   1100 template <>
   1101 EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
   1102   EIGEN_MSA_DEBUG;
   1103 
   1104 #if EIGEN_FAST_MATH
   1105   Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
   1106   Packet2d v = __builtin_msa_fmin_d(a, swapped);
   1107   return v[0];
   1108 #else
   1109   double a0 = a[0], a1 = a[1];
   1110   return ((numext::isnan)(a0) || a0 < a1) ? a0 : a1;
   1111 #endif
   1112 }
   1113 
   1114 // max
   1115 template <>
   1116 EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
   1117   EIGEN_MSA_DEBUG;
   1118 
   1119 #if EIGEN_FAST_MATH
   1120   Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
   1121   Packet2d v = __builtin_msa_fmax_d(a, swapped);
   1122   return v[0];
   1123 #else
   1124   double a0 = a[0], a1 = a[1];
   1125   return ((numext::isnan)(a0) || a0 > a1) ? a0 : a1;
   1126 #endif
   1127 }
   1128 
   1129 template <>
   1130 EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) {
   1131   EIGEN_MSA_DEBUG;
   1132 
   1133   return __builtin_msa_fsqrt_d(a);
   1134 }
   1135 
   1136 template <>
   1137 EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
   1138   EIGEN_MSA_DEBUG;
   1139 
   1140 #if EIGEN_FAST_MATH
   1141   return __builtin_msa_frsqrt_d(a);
   1142 #else
   1143   Packet2d ones = __builtin_msa_ffint_s_d(__builtin_msa_ldi_d(1));
   1144   return pdiv(ones, psqrt(a));
   1145 #endif
   1146 }
   1147 
   1148 inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2d, 2>& value) {
   1149   os << "[ " << value.packet[0] << "," << std::endl << "  " << value.packet[1] << " ]";
   1150   return os;
   1151 }
   1152 
   1153 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
   1154   EIGEN_MSA_DEBUG;
   1155 
   1156   Packet2d trn1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
   1157   Packet2d trn2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
   1158   kernel.packet[0] = trn1;
   1159   kernel.packet[1] = trn2;
   1160 }
   1161 
   1162 template <>
   1163 EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
   1164   Packet2d v = a;
   1165   int32_t old_mode, new_mode;
   1166   asm volatile(
   1167       "cfcmsa  %[old_mode], $1\n"
   1168       "ori     %[new_mode], %[old_mode], 3\n"  // 3 = round towards -INFINITY.
   1169       "ctcmsa  $1, %[new_mode]\n"
   1170       "frint.d %w[v], %w[v]\n"
   1171       "ctcmsa  $1, %[old_mode]\n"
   1172       :  // outputs
   1173       [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
   1174       [v] "+f"(v)
   1175       :  // inputs
   1176       :  // clobbers
   1177   );
   1178   return v;
   1179 }
   1180 
   1181 template <>
   1182 EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
   1183   Packet2d v = a;
   1184   int32_t old_mode, new_mode;
   1185   asm volatile(
   1186       "cfcmsa  %[old_mode], $1\n"
   1187       "ori     %[new_mode], %[old_mode], 3\n"
   1188       "xori    %[new_mode], %[new_mode], 1\n"  // 2 = round towards +INFINITY.
   1189       "ctcmsa  $1, %[new_mode]\n"
   1190       "frint.d %w[v], %w[v]\n"
   1191       "ctcmsa  $1, %[old_mode]\n"
   1192       :  // outputs
   1193       [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
   1194       [v] "+f"(v)
   1195       :  // inputs
   1196       :  // clobbers
   1197   );
   1198   return v;
   1199 }
   1200 
   1201 template <>
   1202 EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
   1203   Packet2d v = a;
   1204   int32_t old_mode, new_mode;
   1205   asm volatile(
   1206       "cfcmsa  %[old_mode], $1\n"
   1207       "ori     %[new_mode], %[old_mode], 3\n"
   1208       "xori    %[new_mode], %[new_mode], 3\n"  // 0 = round to nearest, ties to even.
   1209       "ctcmsa  $1, %[new_mode]\n"
   1210       "frint.d %w[v], %w[v]\n"
   1211       "ctcmsa  $1, %[old_mode]\n"
   1212       :  // outputs
   1213       [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
   1214       [v] "+f"(v)
   1215       :  // inputs
   1216       :  // clobbers
   1217   );
   1218   return v;
   1219 }
   1220 
   1221 template <>
   1222 EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
   1223                                     const Packet2d& elsePacket) {
   1224   Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
   1225   Packet2l mask = __builtin_msa_ceqi_d((Packet2l)select, 0);
   1226   return (Packet2d)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
   1227 }
   1228 
   1229 }  // end namespace internal
   1230 
   1231 }  // end namespace Eigen
   1232 
   1233 #endif  // EIGEN_PACKET_MATH_MSA_H