cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

TensorReduction.h (44395B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
      5 // Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd <eigen@codeplay.com>
      6 //
      7 // This Source Code Form is subject to the terms of the Mozilla
      8 // Public License v. 2.0. If a copy of the MPL was not distributed
      9 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
     10 
     11 #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
     12 #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
     13 
     14 // clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
     15 // so we'll use a macro to make clang happy.
     16 #ifndef KERNEL_FRIEND
     17 #if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
     18 #define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
     19 #else
     20 #define KERNEL_FRIEND friend
     21 #endif
     22 #endif
     23 
     24 
     25 namespace Eigen {
     26 
     27 
     28 /** \class TensorReduction
     29   * \ingroup CXX11_Tensor_Module
     30   *
     31   * \brief Tensor reduction class.
     32   *
     33   */
     34 
     35 namespace internal {
     36   template<typename Op, typename Dims, typename XprType,template <class> class MakePointer_ >
     37   struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_> >
     38  : traits<XprType>
     39 {
     40   typedef traits<XprType> XprTraits;
     41   typedef typename XprTraits::Scalar Scalar;
     42   typedef typename XprTraits::StorageKind StorageKind;
     43   typedef typename XprTraits::Index Index;
     44   typedef typename XprType::Nested Nested;
     45   static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
     46   static const int Layout = XprTraits::Layout;
     47   typedef typename XprTraits::PointerType PointerType;
     48 
     49   template <class T> struct MakePointer {
     50     // Intermediate typedef to workaround MSVC issue.
     51     typedef MakePointer_<T> MakePointerT;
     52     typedef typename MakePointerT::Type Type;
     53   };
     54 };
     55 
     56 template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
     57 struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense>
     58 {
     59   typedef const TensorReductionOp<Op, Dims, XprType, MakePointer_>& type;
     60 };
     61 
     62 template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
     63 struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1, typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_> >::type>
     64 {
     65   typedef TensorReductionOp<Op, Dims, XprType, MakePointer_> type;
     66 };
     67 
     68 
     69 template <typename OutputDims> struct DimInitializer {
     70   template <typename InputDims, typename ReducedDims> EIGEN_DEVICE_FUNC
     71   static void run(const InputDims& input_dims,
     72                   const array<bool, internal::array_size<InputDims>::value>& reduced,
     73                   OutputDims* output_dims, ReducedDims* reduced_dims) {
     74     const int NumInputDims = internal::array_size<InputDims>::value;
     75     int outputIndex = 0;
     76     int reduceIndex = 0;
     77     for (int i = 0; i < NumInputDims; ++i) {
     78       if (reduced[i]) {
     79         (*reduced_dims)[reduceIndex] = input_dims[i];
     80         ++reduceIndex;
     81       } else {
     82         (*output_dims)[outputIndex] = input_dims[i];
     83         ++outputIndex;
     84       }
     85     }
     86   }
     87 };
     88 
     89 template <> struct DimInitializer<Sizes<> > {
     90   template <typename InputDims, typename Index, size_t Rank> EIGEN_DEVICE_FUNC
     91   static void run(const InputDims& input_dims, const array<bool, Rank>&,
     92                   Sizes<>*, array<Index, Rank>* reduced_dims) {
     93     const int NumInputDims = internal::array_size<InputDims>::value;
     94     for (int i = 0; i < NumInputDims; ++i) {
     95       (*reduced_dims)[i] = input_dims[i];
     96     }
     97   }
     98 };
     99 
    100 
    101 template <typename ReducedDims, int NumTensorDims, int Layout>
    102 struct are_inner_most_dims {
    103   static const bool value = false;
    104 };
    105 template <typename ReducedDims, int NumTensorDims, int Layout>
    106 struct preserve_inner_most_dims {
    107   static const bool value = false;
    108 };
    109 
    110 #if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
    111 template <typename ReducedDims, int NumTensorDims>
    112 struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
    113   static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
    114   static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
    115   static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
    116   static const bool value = tmp1 & tmp2 & tmp3;
    117 };
    118 template <typename ReducedDims, int NumTensorDims>
    119 struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
    120   static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
    121   static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
    122   static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
    123   static const bool value = tmp1 & tmp2 & tmp3;
    124 
    125 };
    126 template <typename ReducedDims, int NumTensorDims>
    127 struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
    128   static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
    129   static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
    130   static const bool value = tmp1 & tmp2;
    131 
    132 };
    133 template <typename ReducedDims, int NumTensorDims>
    134 struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
    135   static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
    136   static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
    137   static const bool value = tmp1 & tmp2;
    138 };
    139 #endif
    140 
    141 
    142 template <int DimIndex, typename Self, typename Op>
    143 struct GenericDimReducer {
    144   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
    145     EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    146     for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
    147       const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
    148       GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
    149     }
    150   }
    151 };
    152 template <typename Self, typename Op>
    153 struct GenericDimReducer<0, Self, Op> {
    154   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
    155     for (int j = 0; j < self.m_reducedDims[0]; ++j) {
    156       const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
    157       reducer.reduce(self.m_impl.coeff(input), accum);
    158     }
    159   }
    160 };
    161 template <typename Self, typename Op>
    162 struct GenericDimReducer<-1, Self, Op> {
    163   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) {
    164     reducer.reduce(self.m_impl.coeff(index), accum);
    165   }
    166 };
    167 
    168 template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess),
    169           bool UseTreeReduction = (!Self::ReducerTraits::IsStateful &&
    170                                    !Self::ReducerTraits::IsExactlyAssociative)>
    171 struct InnerMostDimReducer {
    172   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    173     typename Self::CoeffReturnType accum = reducer.initialize();
    174     for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
    175       reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    176     }
    177     return reducer.finalize(accum);
    178   }
    179 };
    180 
    181 template <typename Self, typename Op>
    182 struct InnerMostDimReducer<Self, Op, true, false> {
    183   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    184     const typename Self::Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
    185     const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
    186     typename Self::PacketReturnType paccum = reducer.template initializePacket<typename Self::PacketReturnType>();
    187     for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
    188       reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
    189     }
    190     typename Self::CoeffReturnType accum = reducer.initialize();
    191     for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
    192       reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    193     }
    194     return reducer.finalizeBoth(accum, paccum);
    195   }
    196 };
    197 
    198 #if !defined(EIGEN_HIPCC) 
    199 static const int kLeafSize = 1024;
    200 
    201 template <typename Self, typename Op>
    202 struct InnerMostDimReducer<Self, Op, false, true> {
    203   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
    204   reduce(const Self& self, typename Self::Index firstIndex,
    205          typename Self::Index numValuesToReduce, Op& reducer) {
    206     typename Self::CoeffReturnType accum = reducer.initialize();
    207     if (numValuesToReduce > kLeafSize) {
    208       const typename Self::Index half = numValuesToReduce / 2;
    209       reducer.reduce(reduce(self, firstIndex, half, reducer), &accum);
    210       reducer.reduce(
    211           reduce(self, firstIndex + half, numValuesToReduce - half, reducer),
    212           &accum);
    213     } else {
    214       for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
    215         reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    216       }
    217     }
    218     return reducer.finalize(accum);
    219   }
    220 };
    221 
    222 template <typename Self, typename Op>
    223 struct InnerMostDimReducer<Self, Op, true, true> {
    224   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
    225   reduce(const Self& self, typename Self::Index firstIndex,
    226          typename Self::Index numValuesToReduce, Op& reducer) {
    227     const typename Self::Index packetSize =
    228         internal::unpacket_traits<typename Self::PacketReturnType>::size;
    229     typename Self::CoeffReturnType accum = reducer.initialize();
    230     if (numValuesToReduce > packetSize * kLeafSize) {
    231       // Make sure the split point is aligned on a packet boundary.
    232       const typename Self::Index split =
    233           packetSize *
    234           divup(firstIndex + divup(numValuesToReduce, typename Self::Index(2)),
    235                 packetSize);
    236       const typename Self::Index num_left =
    237           numext::mini(split - firstIndex, numValuesToReduce);
    238       reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum);
    239       if (num_left < numValuesToReduce) {
    240         reducer.reduce(
    241             reduce(self, split, numValuesToReduce - num_left, reducer), &accum);
    242       }
    243       return reducer.finalize(accum);
    244     } else {
    245       const typename Self::Index UnrollSize =
    246           (numValuesToReduce / (2*packetSize)) * 2*packetSize;
    247       const typename Self::Index VectorizedSize =
    248           (numValuesToReduce / packetSize) * packetSize;
    249       typename Self::PacketReturnType paccum =
    250           reducer.template initializePacket<typename Self::PacketReturnType>();
    251       typename Self::PacketReturnType paccum2 =
    252           reducer.template initializePacket<typename Self::PacketReturnType>();
    253       for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) {
    254         reducer.reducePacket(
    255             self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
    256         reducer.reducePacket(
    257             self.m_impl.template packet<Unaligned>(firstIndex + j + packetSize),
    258             &paccum2);
    259       }
    260       for (typename Self::Index j = UnrollSize; j < VectorizedSize; j+= packetSize) {
    261         reducer.reducePacket(self.m_impl.template packet<Unaligned>(
    262                                  firstIndex + j), &paccum);
    263       }
    264       reducer.reducePacket(paccum2, &paccum);
    265       for (typename Self::Index j = VectorizedSize; j < numValuesToReduce;
    266            ++j) {
    267         reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    268       }
    269       return reducer.finalizeBoth(accum, paccum);
    270     }
    271   }
    272 };
    273 #endif
    274  
    275 template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
    276 struct InnerMostDimPreserver {
    277   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
    278     eigen_assert(false && "should never be called");
    279   }
    280 };
    281 
    282 template <int DimIndex, typename Self, typename Op>
    283 struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
    284   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
    285     EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    286     for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
    287       const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
    288       InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
    289     }
    290   }
    291 };
    292 
    293 template <typename Self, typename Op>
    294 struct InnerMostDimPreserver<0, Self, Op, true> {
    295   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
    296     for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
    297       const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
    298       reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
    299     }
    300   }
    301 };
    302 template <typename Self, typename Op>
    303 struct InnerMostDimPreserver<-1, Self, Op, true> {
    304   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
    305     eigen_assert(false && "should never be called");
    306   }
    307 };
    308 
    309 // Default full reducer
    310 template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
    311 struct FullReducer {
    312   static const bool HasOptimizedImplementation = false;
    313 
    314   static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::EvaluatorPointerType output) {
    315     const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
    316     *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
    317   }
    318 };
    319 
    320 
    321 #ifdef EIGEN_USE_THREADS
    322 // Multithreaded full reducers
    323 template <typename Self, typename Op,
    324           bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
    325 struct FullReducerShard {
    326   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
    327                   typename Self::Index numValuesToReduce, Op& reducer,
    328                   typename Self::CoeffReturnType* output) {
    329     *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
    330         self, firstIndex, numValuesToReduce, reducer);
    331   }
    332 };
    333 
    334 // Multithreaded full reducer
    335 template <typename Self, typename Op, bool Vectorizable>
    336 struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
    337   static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful;
    338   static const Index PacketSize =
    339       unpacket_traits<typename Self::PacketReturnType>::size;
    340 
    341   // launch one reducer per thread and accumulate the result.
    342   static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
    343                   typename Self::CoeffReturnType* output) {
    344     typedef typename Self::Index Index;
    345     const Index num_coeffs = array_prod(self.m_impl.dimensions());
    346     if (num_coeffs == 0) {
    347       *output = reducer.finalize(reducer.initialize());
    348       return;
    349     }
    350     const TensorOpCost cost =
    351         self.m_impl.costPerCoeff(Vectorizable) +
    352         TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
    353                      PacketSize);
    354     const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
    355         num_coeffs, cost, device.numThreads());
    356     if (num_threads == 1) {
    357       *output =
    358           InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
    359       return;
    360     }
    361     const Index blocksize =
    362         std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
    363     const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
    364     eigen_assert(num_coeffs >= numblocks * blocksize);
    365 
    366     Barrier barrier(internal::convert_index<unsigned int>(numblocks));
    367     MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
    368     for (Index i = 0; i < numblocks; ++i) {
    369       device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
    370                                   self, i * blocksize, blocksize, reducer,
    371                                   &shards[i]);
    372     }
    373     typename Self::CoeffReturnType finalShard;
    374     if (numblocks * blocksize < num_coeffs) {
    375       finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
    376           self, numblocks * blocksize, num_coeffs - numblocks * blocksize,
    377           reducer);
    378     } else {
    379       finalShard = reducer.initialize();
    380     }
    381     barrier.Wait();
    382 
    383     for (Index i = 0; i < numblocks; ++i) {
    384       reducer.reduce(shards[i], &finalShard);
    385     }
    386     *output = reducer.finalize(finalShard);
    387   }
    388 };
    389 
    390 #endif
    391 
    392 
    393 // Default inner reducer
    394 template <typename Self, typename Op, typename Device>
    395 struct InnerReducer {
    396   static const bool HasOptimizedImplementation = false;
    397 
    398   EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
    399     eigen_assert(false && "Not implemented");
    400     return true;
    401   }
    402 };
    403 
    404 // Default outer reducer
    405 template <typename Self, typename Op, typename Device>
    406 struct OuterReducer {
    407   static const bool HasOptimizedImplementation = false;
    408 
    409   EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
    410     eigen_assert(false && "Not implemented");
    411     return true;
    412   }
    413 };
    414 
    415 #ifdef EIGEN_USE_SYCL
    416 // Default Generic reducer
    417 template <typename Self, typename Op, typename Device>
    418 struct GenericReducer {
    419   static const bool HasOptimizedImplementation = false;
    420 
    421   EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
    422     eigen_assert(false && "Not implemented");
    423     return true;
    424   }
    425 };
    426 #endif
    427 
    428 #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
    429 template <int B, int N, typename S, typename R, typename I_>
    430 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
    431 
    432 
    433 #if defined(EIGEN_HAS_GPU_FP16)
    434 template <typename S, typename R, typename I_>
    435 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*);
    436 template <int B, int N, typename S, typename R, typename I_>
    437 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*);
    438 template <int NPT, typename S, typename R, typename I_>
    439 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
    440 
    441 #endif
    442 
    443 template <int NPT, typename S, typename R, typename I_>
    444 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
    445 
    446 template <int NPT, typename S, typename R, typename I_>
    447 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
    448 #endif
    449 
    450 /**
    451  * For SYCL, the return type of the reduction is deduced from the initialize method of the given Op.
    452  * This allows the reduction to have a different type for the accumulator than the input data type.
    453  * If this is the case, the functor needs to have two reduce method: one for reducing an element of the input
    454  * with the accumulator and the other for reducing two accumulators.
    455  * Such a reducer can be useful for instance when the accumulator is a boolean or a bitset that checks for
    456  * some properties of the input.
    457  */
    458 template <typename Op, typename CoeffReturnType>
    459 struct ReductionReturnType {
    460 #if defined(EIGEN_USE_SYCL)
    461   typedef typename remove_const<decltype(std::declval<Op>().initialize())>::type type;
    462 #else
    463   typedef typename remove_const<CoeffReturnType>::type type;
    464 #endif
    465 };
    466 
    467 }  // end namespace internal
    468 
    469 
    470 template <typename Op, typename Dims, typename XprType,  template <class> class MakePointer_>
    471 class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> {
    472   public:
    473     typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
    474     typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    475     typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
    476     typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
    477     typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
    478     typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;
    479 
    480     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    481     TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims)
    482     { }
    483     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    484     TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer)
    485     { }
    486 
    487     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    488     const XprType& expression() const { return m_expr; }
    489     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    490     const Dims& dims() const { return m_dims; }
    491     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    492     const Op& reducer() const { return m_reducer; }
    493 
    494   protected:
    495     typename XprType::Nested m_expr;
    496     const Dims m_dims;
    497     const Op m_reducer;
    498 };
    499 
    500 template<typename ArgType, typename Device>
    501 struct TensorReductionEvaluatorBase;
    502 
    503 // Eval as rvalue
    504 template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
    505 struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
    506 {
    507   typedef internal::reducer_traits<Op, Device> ReducerTraits;
    508   typedef Dims ReducedDims;
    509   typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
    510   typedef typename XprType::Index Index;
    511   typedef ArgType ChildType;
    512   typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
    513   static const int NumInputDims = internal::array_size<InputDimensions>::value;
    514   static const int NumReducedDims = internal::array_size<Dims>::value;
    515   static const int NumOutputDims = NumInputDims - NumReducedDims;
    516   typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions;
    517   typedef typename XprType::Scalar Scalar;
    518   typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
    519   static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
    520   typedef typename internal::ReductionReturnType<Op, typename XprType::CoeffReturnType>::type CoeffReturnType;
    521   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
    522   static const Index PacketSize = PacketType<CoeffReturnType, Device>::size;
    523 
    524   typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
    525   typedef StorageMemory<CoeffReturnType, Device> Storage;
    526   typedef typename Storage::Type EvaluatorPointerType;
    527 
    528     // Subset of strides of the input tensor for the non-reduced dimensions.
    529   // Indexed by output dimensions.
    530   static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
    531 
    532   enum {
    533     IsAligned = false,
    534     PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
    535     BlockAccess = false,
    536     PreferBlockAccess = true,
    537     Layout = TensorEvaluator<ArgType, Device>::Layout,
    538     CoordAccess = false,  // to be implemented
    539     RawAccess = false
    540   };
    541 
    542   typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
    543 
    544   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
    545   typedef internal::TensorBlockNotImplemented TensorBlock;
    546   //===--------------------------------------------------------------------===//
    547 
    548   static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
    549   static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
    550   static const bool RunningFullReduction = (NumOutputDims==0);
    551 
    552   EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
    553       : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
    554   {
    555     EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
    556     EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
    557                         YOU_MADE_A_PROGRAMMING_MISTAKE);
    558 
    559     // Build the bitmap indicating if an input dimension is reduced or not.
    560     for (int i = 0; i < NumInputDims; ++i) {
    561       m_reduced[i] = false;
    562     }
    563     for (int i = 0; i < NumReducedDims; ++i) {
    564       eigen_assert(op.dims()[i] >= 0);
    565       eigen_assert(op.dims()[i] < NumInputDims);
    566       m_reduced[op.dims()[i]] = true;
    567     }
    568 
    569     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
    570     internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims);
    571 
    572     // Precompute output strides.
    573     if (NumOutputDims > 0) {
    574       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    575         m_outputStrides[0] = 1;
    576         for (int i = 1; i < NumOutputDims; ++i) {
    577           m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
    578           m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
    579         }
    580       } else {
    581         m_outputStrides[NumOutputDims - 1] = 1;
    582         for (int i = NumOutputDims - 2; i >= 0; --i) {
    583           m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
    584           m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
    585         }
    586       }
    587     }
    588 
    589     // Precompute input strides.
    590     if (NumInputDims > 0) {
    591       array<Index, NumInputDims> input_strides;
    592       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    593         input_strides[0] = 1;
    594         for (int i = 1; i < NumInputDims; ++i) {
    595           input_strides[i] = input_strides[i-1] * input_dims[i-1];
    596         }
    597       } else {
    598         input_strides.back() = 1;
    599         for (int i = NumInputDims - 2; i >= 0; --i) {
    600           input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
    601         }
    602       }
    603 
    604       int outputIndex = 0;
    605       int reduceIndex = 0;
    606       for (int i = 0; i < NumInputDims; ++i) {
    607         if (m_reduced[i]) {
    608           m_reducedStrides[reduceIndex] = input_strides[i];
    609           ++reduceIndex;
    610         } else {
    611           m_preservedStrides[outputIndex] = input_strides[i];
    612           m_output_to_input_dim_map[outputIndex] = i;
    613           ++outputIndex;
    614         }
    615       }
    616     }
    617 
    618     // Special case for full reductions
    619     if (NumOutputDims == 0) {
    620       m_preservedStrides[0] = internal::array_prod(input_dims);
    621     }
    622 
    623     m_numValuesToReduce =
    624         NumOutputDims == 0
    625             ? internal::array_prod(input_dims)
    626             : (static_cast<int>(Layout) == static_cast<int>(ColMajor))
    627                   ? m_preservedStrides[0]
    628                   : m_preservedStrides[NumOutputDims - 1];
    629   }
    630 
    631   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
    632 
    633   EIGEN_STRONG_INLINE
    634   bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) {
    635     // Use the FullReducer if possible.
    636     if ((RunningFullReduction && RunningOnSycl) ||(RunningFullReduction &&
    637         internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
    638         ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
    639          !RunningOnGPU))) {
    640       bool need_assign = false;
    641       if (!data) {
    642         m_result = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType))));
    643         data = m_result;
    644         need_assign = true;
    645       }
    646       Op reducer(m_reducer);
    647       internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
    648       return need_assign;
    649     }
    650 
    651     // Attempt to use an optimized reduction.
    652     else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) {
    653       bool reducing_inner_dims = true;
    654       for (int i = 0; i < NumReducedDims; ++i) {
    655         if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    656           reducing_inner_dims &= m_reduced[i];
    657         } else {
    658           reducing_inner_dims &= m_reduced[NumInputDims - 1 - i];
    659         }
    660       }
    661       if (internal::InnerReducer<Self, Op, Device>::HasOptimizedImplementation &&
    662           (reducing_inner_dims || ReducingInnerMostDims)) {
    663         const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
    664         const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
    665         if (!data) {
    666           if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) || (RunningOnSycl)) {
    667             data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
    668             m_result = data;
    669           }
    670           else {
    671             return true;
    672           }
    673         }
    674         Op reducer(m_reducer);
    675         // For SYCL this if always return false
    676         if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
    677           if (m_result) {
    678             m_device.deallocate_temp(m_result);
    679             m_result = NULL;
    680           }
    681           return true;
    682         } else {
    683           return (m_result != NULL);
    684         }
    685       }
    686 
    687       bool preserving_inner_dims = true;
    688       for (int i = 0; i < NumReducedDims; ++i) {
    689         if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    690           preserving_inner_dims &= m_reduced[NumInputDims - 1 - i];
    691         } else {
    692           preserving_inner_dims &= m_reduced[i];
    693         }
    694       }
    695       if (internal::OuterReducer<Self, Op, Device>::HasOptimizedImplementation &&
    696           preserving_inner_dims) {
    697         const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
    698         const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
    699         if (!data) {
    700           if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) || (RunningOnSycl)) {
    701             data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
    702             m_result = data;
    703           }
    704           else {
    705             return true;
    706           }
    707         }
    708         Op reducer(m_reducer);
    709         // For SYCL this if always return false
    710         if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
    711           if (m_result) {
    712             m_device.deallocate_temp(m_result);
    713             m_result = NULL;
    714           }
    715           return true;
    716         } else {
    717           return (m_result != NULL);
    718         }
    719       }
    720       #if defined(EIGEN_USE_SYCL)
    721       // If there is no Optimised version for SYCL, the reduction expression 
    722       // must break into two subexpression and use the SYCL generic Reducer on the device.
    723       if(RunningOnSycl) {
    724          const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
    725          const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
    726          if (!data) {
    727            data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
    728            m_result = data;
    729          }
    730          Op reducer(m_reducer);
    731          internal::GenericReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
    732          return (m_result != NULL);
    733        }
    734       #endif
    735     }
    736     return true;
    737   }
    738 
    739 #ifdef EIGEN_USE_THREADS
    740   template <typename EvalSubExprsCallback>
    741   EIGEN_STRONG_INLINE
    742       void
    743       evalSubExprsIfNeededAsync(EvaluatorPointerType data,
    744                                 EvalSubExprsCallback done) {
    745     m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) {
    746       done(evalSubExprsIfNeededCommon(data));
    747     });
    748   }
    749 #endif
    750 
    751   EIGEN_STRONG_INLINE
    752   bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    753     m_impl.evalSubExprsIfNeeded(NULL);
    754     return evalSubExprsIfNeededCommon(data);
    755   }
    756 
    757   EIGEN_STRONG_INLINE void cleanup() {
    758     m_impl.cleanup();
    759     if (m_result) {
    760       m_device.deallocate_temp(m_result);
    761       m_result = NULL;
    762     }
    763   }
    764 
    765   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
    766   {
    767     if (( RunningFullReduction || RunningOnGPU) && m_result ) {
    768       return *(m_result + index);
    769     }
    770     Op reducer(m_reducer);
    771     if (ReducingInnerMostDims || RunningFullReduction) {
    772       const Index num_values_to_reduce =
    773         (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
    774       return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index),
    775                                                              num_values_to_reduce, reducer);
    776     } else {
    777       typename Self::CoeffReturnType accum = reducer.initialize();
    778       internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
    779       return reducer.finalize(accum);
    780     }
    781   }
    782 
    783   // TODO(bsteiner): provide a more efficient implementation.
    784   template<int LoadMode>
    785   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
    786   {
    787     EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    788     eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));
    789 
    790     if (RunningOnGPU && m_result) {
    791       return internal::pload<PacketReturnType>(m_result + index);
    792     }
    793 
    794     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    795     if (ReducingInnerMostDims) {
    796       const Index num_values_to_reduce =
    797         (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
    798       const Index firstIndex = firstInput(index);
    799       for (Index i = 0; i < PacketSize; ++i) {
    800         Op reducer(m_reducer);
    801         values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
    802                                                                     num_values_to_reduce, reducer);
    803       }
    804     } else if (PreservingInnerMostDims) {
    805       const Index firstIndex = firstInput(index);
    806       const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
    807       // TBD: extend this the the n innermost dimensions that we preserve.
    808       if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
    809         Op reducer(m_reducer);
    810         typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
    811         internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
    812         return reducer.finalizePacket(accum);
    813       } else {
    814         for (int i = 0; i < PacketSize; ++i) {
    815           values[i] = coeff(index + i);
    816         }
    817       }
    818     } else {
    819       for (int i = 0; i < PacketSize; ++i) {
    820         values[i] = coeff(index + i);
    821       }
    822     }
    823     PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    824     return rslt;
    825   }
    826 
    827   // Must be called after evalSubExprsIfNeeded().
    828   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    829     if (RunningFullReduction && m_result) {
    830       return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
    831     } else {
    832       const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
    833       const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
    834       return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
    835           TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
    836     }
    837   }
    838 
    839   EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
    840   EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
    841   EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
    842 #ifdef EIGEN_USE_SYCL
    843   // binding placeholder accessors to a command group handler for SYCL
    844   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    845     m_impl.bind(cgh);
    846     m_result.bind(cgh);
    847   }
    848 #endif
    849 
    850   private:
    851   template <int, typename, typename> friend struct internal::GenericDimReducer;
    852   template <typename, typename, bool, bool> friend struct internal::InnerMostDimReducer;
    853   template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
    854   template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer;
    855 #ifdef EIGEN_USE_THREADS
    856   template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
    857 #endif
    858 #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
    859   template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
    860 #if defined(EIGEN_HAS_GPU_FP16)
    861   template <typename S, typename R, typename I_> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<Eigen::half>::type*);
    862   template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<Eigen::half>::type*);
    863   template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
    864 #endif
    865   template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
    866 
    867   template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
    868 #endif
    869 
    870 #if defined(EIGEN_USE_SYCL)
    871  template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::GenericNondeterministicReducer;
    872  // SYCL need the Generic reducer for the case the recution algorithm is neither inner, outer, and full reducer
    873  template <typename, typename, typename> friend struct internal::GenericReducer;
    874 #endif
    875 
    876 
    877   template <typename S, typename O, typename D> friend struct internal::InnerReducer;
    878 
    879   struct BlockIteratorState {
    880     Index input_dim;
    881     Index output_size;
    882     Index output_count;
    883   };
    884 
    885   // Returns the Index in the input tensor of the first value that needs to be
    886   // used to compute the reduction at output index "index".
    887   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    888     if (ReducingInnerMostDims) {
    889       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    890         return index * m_preservedStrides[0];
    891       } else {
    892         return index * m_preservedStrides[NumPreservedStrides - 1];
    893       }
    894     }
    895     // TBD: optimize the case where we preserve the innermost dimensions.
    896     Index startInput = 0;
    897     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    898       for (int i = NumOutputDims - 1; i > 0; --i) {
    899         // This is index_i in the output tensor.
    900         const Index idx = index / m_outputStrides[i];
    901         startInput += idx * m_preservedStrides[i];
    902         index -= idx * m_outputStrides[i];
    903       }
    904       if (PreservingInnerMostDims) {
    905         eigen_assert(m_preservedStrides[0] == 1);
    906         startInput += index;
    907       } else {
    908         startInput += index * m_preservedStrides[0];
    909       }
    910     } else {
    911       for (int i = 0; i < NumOutputDims - 1; ++i) {
    912         // This is index_i in the output tensor.
    913         const Index idx = index / m_outputStrides[i];
    914         startInput += idx * m_preservedStrides[i];
    915         index -= idx * m_outputStrides[i];
    916       }
    917       if (PreservingInnerMostDims) {
    918         eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1);
    919         startInput += index;
    920       } else {
    921         startInput += index * m_preservedStrides[NumPreservedStrides - 1];
    922       }
    923     }
    924     return startInput;
    925   }
    926 
    927   // Bitmap indicating if an input dimension is reduced or not.
    928   array<bool, NumInputDims> m_reduced;
    929   // Dimensions of the output of the operation.
    930   Dimensions m_dimensions;
    931   // Precomputed strides for the output tensor.
    932   array<Index, NumOutputDims> m_outputStrides;
    933   array<internal::TensorIntDivisor<Index>, NumOutputDims> m_fastOutputStrides;
    934   array<Index, NumPreservedStrides> m_preservedStrides;
    935   // Map from output to input dimension index.
    936   array<Index, NumOutputDims> m_output_to_input_dim_map;
    937   // How many values go into each reduction
    938   Index m_numValuesToReduce;
    939 
    940   // Subset of strides of the input tensor for the reduced dimensions.
    941   // Indexed by reduced dimensions.
    942   array<Index, NumReducedDims> m_reducedStrides;
    943   // Size of the input dimensions that are reduced.
    944   // Indexed by reduced dimensions.
    945   array<Index, NumReducedDims> m_reducedDims;
    946 
    947   // Evaluator for the input expression.
    948   TensorEvaluator<ArgType, Device> m_impl;
    949 
    950   // Operation to apply for computing the reduction.
    951   Op m_reducer;
    952 
    953   // For full reductions
    954 #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
    955   static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
    956   static const bool RunningOnSycl = false;
    957 #elif defined(EIGEN_USE_SYCL)
    958 static const bool RunningOnSycl = internal::is_same<typename internal::remove_all<Device>::type, Eigen::SyclDevice>::value;
    959 static const bool RunningOnGPU = false;
    960 #else
    961   static const bool RunningOnGPU = false;
    962   static const bool RunningOnSycl = false;
    963 #endif
    964   EvaluatorPointerType m_result;
    965 
    966   const Device EIGEN_DEVICE_REF m_device;
    967 };
    968 
    969 template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
    970 struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
    971 : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
    972   typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Base;
    973   EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){}
    974 };
    975 
    976 
    977 template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_>
    978 struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
    979 : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> {
    980 
    981   typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> Base;
    982   EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){}
    983   // The coeff function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel
    984   //Therefore the coeff function should be overridden by for SYCL kernel
    985   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const {
    986     return *(this->data() + index);
    987   }
    988   // The packet function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel
    989   //Therefore the packet function should be overridden by for SYCL kernel
    990   template<int LoadMode>
    991   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const {
    992     return internal::pload<typename Base::PacketReturnType>(this->data() + index);
    993   }
    994 };
    995 
    996 } // end namespace Eigen
    997 
    998 #endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H