cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

PartialReduxEvaluator.h (9207B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2011-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 #ifndef EIGEN_PARTIALREDUX_H
     11 #define EIGEN_PARTIALREDUX_H
     12 
     13 namespace Eigen { 
     14 
     15 namespace internal {
     16 
     17 
     18 /***************************************************************************
     19 *
     20 * This file provides evaluators for partial reductions.
     21 * There are two modes:
     22 *
     23 *  - scalar path: simply calls the respective function on the column or row.
     24 *    -> nothing special here, all the tricky part is handled by the return
     25 *       types of VectorwiseOp's members. They embed the functor calling the
     26 *       respective DenseBase's member function.
     27 *
     28 *  - vectorized path: implements a packet-wise reductions followed by
     29 *    some (optional) processing of the outcome, e.g., division by n for mean.
     30 *
     31 * For the vectorized path let's observe that the packet-size and outer-unrolling
     32 * are both decided by the assignement logic. So all we have to do is to decide
     33 * on the inner unrolling.
     34 *
     35 * For the unrolling, we can reuse "internal::redux_vec_unroller" from Redux.h,
     36 * but be need to be careful to specify correct increment.
     37 *
     38 ***************************************************************************/
     39 
     40 
     41 /* logic deciding a strategy for unrolling of vectorized paths */
     42 template<typename Func, typename Evaluator>
     43 struct packetwise_redux_traits
     44 {
     45   enum {
     46     OuterSize = int(Evaluator::IsRowMajor) ? Evaluator::RowsAtCompileTime : Evaluator::ColsAtCompileTime,
     47     Cost = OuterSize == Dynamic ? HugeCost
     48          : OuterSize * Evaluator::CoeffReadCost + (OuterSize-1) * functor_traits<Func>::Cost,
     49     Unrolling = Cost <= EIGEN_UNROLLING_LIMIT ? CompleteUnrolling : NoUnrolling
     50   };
     51 
     52 };
     53 
     54 /* Value to be returned when size==0 , by default let's return 0 */
     55 template<typename PacketType,typename Func>
     56 EIGEN_DEVICE_FUNC
     57 PacketType packetwise_redux_empty_value(const Func& ) { return pset1<PacketType>(0); }
     58 
     59 /* For products the default is 1 */
     60 template<typename PacketType,typename Scalar>
     61 EIGEN_DEVICE_FUNC
     62 PacketType packetwise_redux_empty_value(const scalar_product_op<Scalar,Scalar>& ) { return pset1<PacketType>(1); }
     63 
     64 /* Perform the actual reduction */
     65 template<typename Func, typename Evaluator,
     66          int Unrolling = packetwise_redux_traits<Func, Evaluator>::Unrolling
     67 >
     68 struct packetwise_redux_impl;
     69 
     70 /* Perform the actual reduction with unrolling */
     71 template<typename Func, typename Evaluator>
     72 struct packetwise_redux_impl<Func, Evaluator, CompleteUnrolling>
     73 {
     74   typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
     75   typedef typename Evaluator::Scalar Scalar;
     76 
     77   template<typename PacketType>
     78   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
     79   PacketType run(const Evaluator &eval, const Func& func, Index /*size*/)
     80   {
     81     return redux_vec_unroller<Func, Evaluator, 0, packetwise_redux_traits<Func, Evaluator>::OuterSize>::template run<PacketType>(eval,func);
     82   }
     83 };
     84 
     85 /* Add a specialization of redux_vec_unroller for size==0 at compiletime.
     86  * This specialization is not required for general reductions, which is
     87  * why it is defined here.
     88  */
     89 template<typename Func, typename Evaluator, int Start>
     90 struct redux_vec_unroller<Func, Evaluator, Start, 0>
     91 {
     92   template<typename PacketType>
     93   EIGEN_DEVICE_FUNC
     94   static EIGEN_STRONG_INLINE PacketType run(const Evaluator &, const Func& f)
     95   {
     96     return packetwise_redux_empty_value<PacketType>(f);
     97   }
     98 };
     99 
    100 /* Perform the actual reduction for dynamic sizes */
    101 template<typename Func, typename Evaluator>
    102 struct packetwise_redux_impl<Func, Evaluator, NoUnrolling>
    103 {
    104   typedef typename Evaluator::Scalar Scalar;
    105   typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
    106 
    107   template<typename PacketType>
    108   EIGEN_DEVICE_FUNC
    109   static PacketType run(const Evaluator &eval, const Func& func, Index size)
    110   {
    111     if(size==0)
    112       return packetwise_redux_empty_value<PacketType>(func);
    113     
    114     const Index size4 = (size-1)&(~3);
    115     PacketType p = eval.template packetByOuterInner<Unaligned,PacketType>(0,0);
    116     Index i = 1;
    117     // This loop is optimized for instruction pipelining:
    118     // - each iteration generates two independent instructions
    119     // - thanks to branch prediction and out-of-order execution we have independent instructions across loops
    120     for(; i<size4; i+=4)
    121       p = func.packetOp(p,
    122             func.packetOp(
    123               func.packetOp(eval.template packetByOuterInner<Unaligned,PacketType>(i+0,0),eval.template packetByOuterInner<Unaligned,PacketType>(i+1,0)),
    124               func.packetOp(eval.template packetByOuterInner<Unaligned,PacketType>(i+2,0),eval.template packetByOuterInner<Unaligned,PacketType>(i+3,0))));
    125     for(; i<size; ++i)
    126       p = func.packetOp(p, eval.template packetByOuterInner<Unaligned,PacketType>(i,0));
    127     return p;
    128   }
    129 };
    130 
    131 template< typename ArgType, typename MemberOp, int Direction>
    132 struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
    133   : evaluator_base<PartialReduxExpr<ArgType, MemberOp, Direction> >
    134 {
    135   typedef PartialReduxExpr<ArgType, MemberOp, Direction> XprType;
    136   typedef typename internal::nested_eval<ArgType,1>::type ArgTypeNested;
    137   typedef typename internal::add_const_on_value_type<ArgTypeNested>::type ConstArgTypeNested;
    138   typedef typename internal::remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;
    139   typedef typename ArgType::Scalar InputScalar;
    140   typedef typename XprType::Scalar Scalar;
    141   enum {
    142     TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) :  int(ArgType::ColsAtCompileTime)
    143   };
    144   typedef typename MemberOp::template Cost<int(TraversalSize)> CostOpType;
    145   enum {
    146     CoeffReadCost = TraversalSize==Dynamic ? HugeCost
    147                   : TraversalSize==0 ? 1
    148                   : int(TraversalSize) * int(evaluator<ArgType>::CoeffReadCost) + int(CostOpType::value),
    149     
    150     _ArgFlags = evaluator<ArgType>::Flags,
    151 
    152     _Vectorizable =  bool(int(_ArgFlags)&PacketAccessBit)
    153                   && bool(MemberOp::Vectorizable)
    154                   && (Direction==int(Vertical) ? bool(_ArgFlags&RowMajorBit) : (_ArgFlags&RowMajorBit)==0)
    155                   && (TraversalSize!=0),
    156                   
    157     Flags = (traits<XprType>::Flags&RowMajorBit)
    158           | (evaluator<ArgType>::Flags&(HereditaryBits&(~RowMajorBit)))
    159           | (_Vectorizable ? PacketAccessBit : 0)
    160           | LinearAccessBit,
    161     
    162     Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized
    163   };
    164 
    165   EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr)
    166     : m_arg(xpr.nestedExpression()), m_functor(xpr.functor())
    167   {
    168     EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : (TraversalSize==0 ? 1 : int(CostOpType::value)));
    169     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
    170   }
    171 
    172   typedef typename XprType::CoeffReturnType CoeffReturnType;
    173 
    174   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    175   const Scalar coeff(Index i, Index j) const
    176   {
    177     return coeff(Direction==Vertical ? j : i);
    178   }
    179 
    180   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    181   const Scalar coeff(Index index) const
    182   {
    183     return m_functor(m_arg.template subVector<DirectionType(Direction)>(index));
    184   }
    185 
    186   template<int LoadMode,typename PacketType>
    187   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    188   PacketType packet(Index i, Index j) const
    189   {
    190     return packet<LoadMode,PacketType>(Direction==Vertical ? j : i);
    191   }
    192   
    193   template<int LoadMode,typename PacketType>
    194   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
    195   PacketType packet(Index idx) const
    196   {
    197     enum { PacketSize = internal::unpacket_traits<PacketType>::size };
    198     typedef Block<const ArgTypeNestedCleaned,
    199                   Direction==Vertical ? int(ArgType::RowsAtCompileTime) : int(PacketSize),
    200                   Direction==Vertical ? int(PacketSize) : int(ArgType::ColsAtCompileTime),
    201                   true /* InnerPanel */> PanelType;
    202     
    203     PanelType panel(m_arg,
    204                     Direction==Vertical ? 0 : idx,
    205                     Direction==Vertical ? idx : 0,
    206                     Direction==Vertical ? m_arg.rows() : Index(PacketSize),
    207                     Direction==Vertical ? Index(PacketSize) : m_arg.cols());
    208 
    209     // FIXME
    210     // See bug 1612, currently if PacketSize==1 (i.e. complex<double> with 128bits registers) then the storage-order of panel get reversed
    211     // and methods like packetByOuterInner do not make sense anymore in this context.
    212     // So let's just by pass "vectorization" in this case:
    213     if(PacketSize==1)
    214       return internal::pset1<PacketType>(coeff(idx));
    215     
    216     typedef typename internal::redux_evaluator<PanelType> PanelEvaluator;
    217     PanelEvaluator panel_eval(panel);
    218     typedef typename MemberOp::BinaryOp BinaryOp;
    219     PacketType p = internal::packetwise_redux_impl<BinaryOp,PanelEvaluator>::template run<PacketType>(panel_eval,m_functor.binaryFunc(),m_arg.outerSize());
    220     return p;
    221   }
    222 
    223 protected:
    224   ConstArgTypeNested m_arg;
    225   const MemberOp m_functor;
    226 };
    227 
    228 } // end namespace internal
    229 
    230 } // end namespace Eigen
    231 
    232 #endif // EIGEN_PARTIALREDUX_H