cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

GeneralBlockPanelKernel.h (6815B)


      1 namespace Eigen {
      2 namespace internal {
      3   
      4 #if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
      5 
      6 // Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm.
      7 // Here we specialize gebp_traits to eliminate these register spills.
      8 // See #2138.
      9 template<>
     10 struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
     11  : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
     12 {
     13   EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
     14   { 
     15     // This volatile inline ASM both acts as a barrier to prevent reordering,
     16     // as well as enforces strict register use.
     17     asm volatile(
     18       "vmla.f32 %q[r], %q[c], %q[alpha]"
     19       : [r] "+w" (r)
     20       : [c] "w" (c),
     21         [alpha] "w" (alpha)
     22       : );
     23   }
     24 
     25   template <typename LaneIdType>
     26   EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b,
     27                                 Packet4f& c, Packet4f& tmp,
     28                                 const LaneIdType&) const {
     29     acc(a, b, c);
     30   }
     31   
     32   template <typename LaneIdType>
     33   EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket<Packet4f>& b,
     34                                 Packet4f& c, Packet4f& tmp,
     35                                 const LaneIdType& lane) const {
     36     madd(a, b.get(lane), c, tmp, lane);
     37   }
     38 };
     39 
     40 #endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
     41 
     42 #if EIGEN_ARCH_ARM64
     43 
     44 template<>
     45 struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
     46  : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
     47 {
     48   typedef float RhsPacket;
     49   typedef float32x4_t RhsPacketx4;
     50 
     51   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
     52   {
     53     dest = *b;
     54   }
     55 
     56   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
     57   {
     58     dest = vld1q_f32(b);
     59   }
     60 
     61   EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
     62   {
     63     dest = *b;
     64   }
     65 
     66   EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
     67   {}
     68 
     69   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
     70   {
     71     loadRhs(b,dest);
     72   }
     73 
     74   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
     75   {
     76     c = vfmaq_n_f32(c, a, b);
     77   }
     78 
     79   // NOTE: Template parameter inference failed when compiled with Android NDK:
     80   // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
     81 
     82   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
     83   { madd_helper<0>(a, b, c); }
     84   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
     85   { madd_helper<1>(a, b, c); }
     86   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
     87   { madd_helper<2>(a, b, c); }
     88   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
     89   { madd_helper<3>(a, b, c); }
     90 
     91  private:
     92   template<int LaneID>
     93   EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
     94   {
     95     #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
     96     // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
     97     // vfmaq_laneq_f32 is implemented through a costly dup
     98          if(LaneID==0)  asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) :  );
     99     else if(LaneID==1)  asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) :  );
    100     else if(LaneID==2)  asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) :  );
    101     else if(LaneID==3)  asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) :  );
    102     #else
    103     c = vfmaq_laneq_f32(c, a, b, LaneID);
    104     #endif
    105   }
    106 };
    107 
    108 
    109 template<>
    110 struct gebp_traits <double,double,false,false,Architecture::NEON>
    111  : gebp_traits<double,double,false,false,Architecture::Generic>
    112 {
    113   typedef double RhsPacket;
    114 
    115   struct RhsPacketx4 {
    116     float64x2_t B_0, B_1;
    117   };
    118 
    119   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
    120   {
    121     dest = *b;
    122   }
    123 
    124   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
    125   {
    126     dest.B_0 = vld1q_f64(b);
    127     dest.B_1 = vld1q_f64(b+2);
    128   }
    129 
    130   EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
    131   {
    132     loadRhs(b,dest);
    133   }
    134 
    135   EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
    136   {}
    137 
    138   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
    139   {
    140     loadRhs(b,dest);
    141   }
    142 
    143   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
    144   {
    145     c = vfmaq_n_f64(c, a, b);
    146   }
    147 
    148   // NOTE: Template parameter inference failed when compiled with Android NDK:
    149   // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
    150 
    151   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
    152   { madd_helper<0>(a, b, c); }
    153   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
    154   { madd_helper<1>(a, b, c); }
    155   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
    156   { madd_helper<2>(a, b, c); }
    157   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
    158   { madd_helper<3>(a, b, c); }
    159 
    160  private:
    161   template <int LaneID>
    162   EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
    163   {
    164     #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
    165     // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
    166     // vfmaq_laneq_f64 is implemented through a costly dup
    167          if(LaneID==0)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
    168     else if(LaneID==1)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
    169     else if(LaneID==2)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
    170     else if(LaneID==3)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
    171     #else
    172          if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
    173     else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
    174     else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
    175     else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
    176     #endif
    177   }
    178 };
    179 
    180 #endif // EIGEN_ARCH_ARM64
    181 
    182 }  // namespace internal
    183 }  // namespace Eigen