cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

TensorReductionSycl.h (30074B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Mehdi Goli    Codeplay Software Ltd.
      5 // Ralph Potter  Codeplay Software Ltd.
      6 // Luke Iwanski  Codeplay Software Ltd.
      7 // Contact: <eigen@codeplay.com>
      8 //
      9 // This Source Code Form is subject to the terms of the Mozilla
     10 // Public License v. 2.0. If a copy of the MPL was not distributed
     11 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
     12 
     13 /*****************************************************************
     14  * TensorReductionSycl.h
     15  *
     16  * \brief:
     17  *  This is the specialization of the reduction operation. Two phase reduction approach 
     18  * is used since the GPU does not have Global Synchronization for global memory among 
     19  * different work-group/thread block. To solve the problem, we need to create two kernels 
     20  * to reduce the data, where the first kernel reduce the data locally and each local 
     21  * workgroup/thread-block save the input data into global memory. In the second phase (global reduction)
     22  * one work-group uses one work-group/thread-block to reduces the intermediate data into one single element. 
     23  * Here is an NVIDIA presentation explaining the optimized two phase reduction algorithm on GPU:
     24  * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
     25  *
     26  *****************************************************************/
     27 
     28 #ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
     29 #define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
     30 namespace Eigen {
     31 namespace TensorSycl {
     32 namespace internal {
     33 
     34 template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
     35 struct OpDefiner {
     36   typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
     37   typedef Op type;
     38   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }
     39 
     40   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
     41                                                                             const Index &) {
     42     return accumulator;
     43   }
     44 };
     45 
     46 template <typename CoeffReturnType, typename Index>
     47 struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
     48   typedef Eigen::internal::SumReducer<CoeffReturnType> type;
     49   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
     50     return type();
     51   }
     52 
     53   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
     54                                                                            const Index &scale) {
     55     ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
     56     return quotient_op(accumulator, CoeffReturnType(scale));
     57   }
     58 };
     59 
     60 template <typename CoeffReturnType, typename Index>
     61 struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
     62   typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
     63   typedef Eigen::internal::SumReducer<CoeffReturnType> type;
     64   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
     65     return type();
     66   }
     67 
     68   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
     69                                                                             const Index &scale) {
     70     return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
     71   }
     72 };
     73 
     74 template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
     75           Index local_range>
     76 struct SecondStepFullReducer {
     77   typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
     78       LocalAccessor;
     79   typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
     80   typedef typename OpDef::type Op;
     81   LocalAccessor scratch;
     82   InputAccessor aI;
     83   OutputAccessor outAcc;
     84   Op op;
     85   SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
     86       : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
     87 
     88   void operator()(cl::sycl::nd_item<1> itemID) {
     89     // Our empirical research shows that the best performance will be achieved
     90     // when there is only one element per thread to reduce in the second step.
     91     // in this step the second step reduction time is almost negligible.
     92     // Hence, in the second step of reduction the input size is fixed to the
     93     // local size, thus, there is only one element read per thread. The
     94     // algorithm must be changed if the number of reduce per thread in the
     95     // second step is greater than 1. Otherwise, the result will be wrong.
     96     const Index localid = itemID.get_local_id(0);
     97     auto aInPtr = aI.get_pointer() + localid;
     98     auto aOutPtr = outAcc.get_pointer();
     99     CoeffReturnType *scratchptr = scratch.get_pointer();
    100     CoeffReturnType accumulator = *aInPtr;
    101 
    102     scratchptr[localid] = op.finalize(accumulator);
    103     for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
    104       itemID.barrier(cl::sycl::access::fence_space::local_space);
    105       if (localid < offset) {
    106         op.reduce(scratchptr[localid + offset], &accumulator);
    107         scratchptr[localid] = op.finalize(accumulator);
    108       }
    109     }
    110     if (localid == 0) *aOutPtr = op.finalize(accumulator);
    111   }
    112 };
    113 
    114 // Full reduction first phase. In this version the vectorization is true and the reduction accept 
    115 // any generic reducerOp  e.g( max, min, sum, mean, iamax, iamin, etc ). 
    116 template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
    117 class FullReductionKernelFunctor {
    118  public:
    119   typedef typename Evaluator::CoeffReturnType CoeffReturnType;
    120   typedef typename Evaluator::Index Index;
    121   typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
    122                     (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
    123       OpDef;
    124 
    125   typedef typename OpDef::type Op;
    126   typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
    127   typedef typename Evaluator::PacketReturnType PacketReturnType;
    128   typedef
    129       typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
    130                                               PacketReturnType, CoeffReturnType>::type OutType;
    131   typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
    132       LocalAccessor;
    133   LocalAccessor scratch;
    134   Evaluator evaluator;
    135   EvaluatorPointerType final_output;
    136   Index rng;
    137   Op op;
    138 
    139   FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
    140                              Index rng_, OpType op_)
    141       : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}
    142 
    143   void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }
    144 
    145   template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
    146   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<Vect>::type compute_reduction(
    147       const cl::sycl::nd_item<1> &itemID) {
    148     auto output_ptr = final_output.get_pointer();
    149     Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
    150     Index globalid = itemID.get_global_id(0);
    151     Index localid = itemID.get_local_id(0);
    152     Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    153     Index start = Evaluator::PacketSize * globalid;
    154     // vectorizable parts
    155     PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
    156     for (Index i = start; i < VectorizedRange; i += step) {
    157       op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
    158     }
    159     globalid += VectorizedRange;
    160     // non vectorizable parts
    161     for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
    162       op.template reducePacket<PacketReturnType>(
    163           ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
    164               evaluator.impl().coeff(i), op.initialize()),
    165           &packetAccumulator);
    166     }
    167     scratch[localid] = packetAccumulator =
    168         OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
    169     // reduction parts // Local size is always power of 2
    170     EIGEN_UNROLL_LOOP
    171     for (Index offset = local_range / 2; offset > 0; offset /= 2) {
    172       itemID.barrier(cl::sycl::access::fence_space::local_space);
    173       if (localid < offset) {
    174         op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
    175         scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
    176       }
    177     }
    178     if (localid == 0) {
    179       output_ptr[itemID.get_group(0)] =
    180           op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
    181     }
    182   }
    183 
    184   template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
    185   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!Vect>::type compute_reduction(
    186       const cl::sycl::nd_item<1> &itemID) {
    187     auto output_ptr = final_output.get_pointer();
    188     Index globalid = itemID.get_global_id(0);
    189     Index localid = itemID.get_local_id(0);
    190     // vectorizable parts
    191     CoeffReturnType accumulator = op.initialize();
    192     // non vectorizable parts
    193     for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
    194       op.reduce(evaluator.impl().coeff(i), &accumulator);
    195     }
    196     scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);
    197 
    198     // reduction parts. the local size is always power of 2
    199     EIGEN_UNROLL_LOOP
    200     for (Index offset = local_range / 2; offset > 0; offset /= 2) {
    201       itemID.barrier(cl::sycl::access::fence_space::local_space);
    202       if (localid < offset) {
    203         op.reduce(scratch[localid + offset], &accumulator);
    204         scratch[localid] = op.finalize(accumulator);
    205       }
    206     }
    207     if (localid == 0) {
    208       output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
    209     }
    210   }
    211 };
    212 
    213 template <typename Evaluator, typename OpType>
    214 class GenericNondeterministicReducer {
    215  public:
    216   typedef typename Evaluator::CoeffReturnType CoeffReturnType;
    217   typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
    218   typedef typename Evaluator::Index Index;
    219   typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
    220   typedef typename OpDef::type Op;
    221   template <typename Scratch>
    222   GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
    223                        Index range_, Index num_values_to_reduce_)
    224       : evaluator(evaluator_),
    225         output_accessor(output_accessor_),
    226         functor(OpDef::get_op(functor_)),
    227         range(range_),
    228         num_values_to_reduce(num_values_to_reduce_) {}
    229 
    230   void operator()(cl::sycl::nd_item<1> itemID) {
    231     auto output_accessor_ptr = output_accessor.get_pointer();
    232     /// const cast added as a naive solution to solve the qualifier drop error
    233     Index globalid = static_cast<Index>(itemID.get_global_linear_id());
    234     if (globalid < range) {
    235       CoeffReturnType accum = functor.initialize();
    236       Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
    237           evaluator, evaluator.firstInput(globalid), functor, &accum);
    238       output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
    239     }
    240   }
    241 
    242  private:
    243   Evaluator evaluator;
    244   EvaluatorPointerType output_accessor;
    245   Op functor;
    246   Index range;
    247   Index num_values_to_reduce;
    248 };
    249 
    250 enum class reduction_dim { inner_most, outer_most };
    251 // default is preserver
    252 template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
    253 struct PartialReductionKernel {
    254   typedef typename Evaluator::CoeffReturnType CoeffReturnType;
    255   typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
    256   typedef typename Evaluator::Index Index;
    257   typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
    258   typedef typename OpDef::type Op;
    259   typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
    260       ScratchAcc;
    261   ScratchAcc scratch;
    262   Evaluator evaluator;
    263   EvaluatorPointerType output_accessor;
    264   Op op;
    265   const Index preserve_elements_num_groups;
    266   const Index reduce_elements_num_groups;
    267   const Index num_coeffs_to_preserve;
    268   const Index num_coeffs_to_reduce;
    269 
    270   PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
    271                          const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
    272                          const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
    273       : scratch(scratch_),
    274         evaluator(evaluator_),
    275         output_accessor(output_accessor_),
    276         op(OpDef::get_op(op_)),
    277         preserve_elements_num_groups(preserve_elements_num_groups_),
    278         reduce_elements_num_groups(reduce_elements_num_groups_),
    279         num_coeffs_to_preserve(num_coeffs_to_preserve_),
    280         num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
    281 
    282   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
    283                                                                  CoeffReturnType &accumulator) {
    284     if (globalPId >= num_coeffs_to_preserve) {
    285       return;
    286     }
    287     Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
    288                                                           : globalRId + (globalPId * num_coeffs_to_reduce);
    289     Index localOffset = globalRId;
    290 
    291     const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
    292     const Index per_thread_global_stride =
    293         rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
    294     for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
    295       op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
    296       localOffset += per_thread_local_stride;
    297       global_offset += per_thread_global_stride;
    298     }
    299   }
    300   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
    301     const Index linearLocalThreadId = itemID.get_local_id(0);
    302     Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
    303                                                            : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
    304     Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
    305                                                            : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
    306     const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
    307                                                            : itemID.get_group(0) / reduce_elements_num_groups;
    308     const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
    309                                                            : itemID.get_group(0) % reduce_elements_num_groups;
    310 
    311     Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    312     const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
    313     auto scratchPtr = scratch.get_pointer().get();
    314     auto outPtr =
    315         output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
    316     CoeffReturnType accumulator = op.initialize();
    317 
    318     element_wise_reduce(globalRId, globalPId, accumulator);
    319 
    320     accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
    321     scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
    322         accumulator;
    323     if (rt == reduction_dim::inner_most) {
    324       pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
    325       rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
    326       globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    327     }
    328 
    329     /* Apply the reduction operation between the current local
    330      * id and the one on the other half of the vector. */
    331     auto out_scratch_ptr =
    332         scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
    333     itemID.barrier(cl::sycl::access::fence_space::local_space);
    334     if (rt == reduction_dim::inner_most) {
    335       accumulator = *out_scratch_ptr;
    336     }
    337     // The Local LocalThreadSizeR is always power of 2
    338     EIGEN_UNROLL_LOOP
    339     for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
    340       if (rLocalThreadId < offset) {
    341         op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
    342         // The result has already been divided for mean reducer in the
    343         // previous reduction so no need to divide furthermore
    344         *out_scratch_ptr = op.finalize(accumulator);
    345       }
    346       /* All threads collectively read from global memory into local.
    347        * The barrier ensures all threads' IO is resolved before
    348        * execution continues (strictly speaking, all threads within
    349        * a single work-group - there is no co-ordination between
    350        * work-groups, only work-items). */
    351       itemID.barrier(cl::sycl::access::fence_space::local_space);
    352     }
    353 
    354     if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
    355       outPtr[globalPId] = op.finalize(accumulator);
    356     }
    357   }
    358 };
    359 
    360 template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
    361 struct SecondStepPartialReduction {
    362   typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
    363   typedef typename OpDef::type Op;
    364   typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
    365       ScratchAccessor;
    366   InputAccessor input_accessor;
    367   OutputAccessor output_accessor;
    368   Op op;
    369   const Index num_coeffs_to_preserve;
    370   const Index num_coeffs_to_reduce;
    371 
    372   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
    373                                                                    OutputAccessor output_accessor_, OpType op_,
    374                                                                    const Index num_coeffs_to_preserve_,
    375                                                                    const Index num_coeffs_to_reduce_)
    376       : input_accessor(input_accessor_),
    377         output_accessor(output_accessor_),
    378         op(OpDef::get_op(op_)),
    379         num_coeffs_to_preserve(num_coeffs_to_preserve_),
    380         num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
    381 
    382   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
    383     const Index globalId = itemID.get_global_id(0);
    384 
    385     if (globalId >= num_coeffs_to_preserve) return;
    386 
    387     auto in_ptr = input_accessor.get_pointer() + globalId;
    388 
    389     OutScalar accumulator = op.initialize();
    390 // num_coeffs_to_reduce is not bigger that 256
    391     for (Index i = 0; i < num_coeffs_to_reduce; i++) {
    392       op.reduce(*in_ptr, &accumulator);
    393       in_ptr += num_coeffs_to_preserve;
    394     }
    395     output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
    396   }
    397 };  // namespace internal
    398 
    399 template <typename Index, Index LTP, Index LTR, bool BC_>
    400 struct ReductionPannel {
    401   static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
    402   static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
    403   static EIGEN_CONSTEXPR bool BC = BC_;
    404 };
    405 
    406 template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
    407 struct PartialReducerLauncher {
    408   typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
    409   typedef typename Self::CoeffReturnType CoeffReturnType;
    410   typedef typename Self::Storage Storage;
    411   typedef typename Self::Index Index;
    412   typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
    413       PannelParameters;
    414 
    415   typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;
    416 
    417   static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
    418                   Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
    419     Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);
    420 
    421     // getPowerOfTwo makes sure local range is power of 2 and <=
    422     // maxSyclThreadPerBlock this will help us to avoid extra check on the
    423     // kernel
    424     static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
    425                     (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
    426                   "The Local thread size must be a power of 2 for the reduction "
    427                   "operation");
    428 
    429     EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
    430     // In this step, we force the code not to be more than 2-step reduction:
    431     // Our empirical research shows that if each thread reduces at least 64
    432     // elemnts individually, we get better performance. However, this can change
    433     // on different platforms. In this step we force the code not to be
    434     // morthan step reduction: Our empirical research shows that for inner_most
    435     // dim reducer, it is better to have 8 group in a reduce dimension for sizes
    436     // > 1024 to achieve the best performance.
    437     const Index reductionPerThread = 64;
    438     Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
    439     const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
    440     Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
    441     const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
    442     const Index globalRange = pNumGroups * rNumGroups * localRange;
    443 
    444     EIGEN_CONSTEXPR Index scratchSize =
    445         PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
    446     auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
    447     if (rNumGroups > 1) {
    448       CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
    449           dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
    450       EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
    451       dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
    452           self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
    453           num_coeffs_to_reduce);
    454 
    455       typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
    456           SecondStepPartialReductionKernel;
    457 
    458       dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
    459           temp_accessor, output,
    460           cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
    461           reducer, num_coeffs_to_preserve, rNumGroups);
    462 
    463       self.device().deallocate_temp(temp_pointer);
    464     } else {
    465       dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
    466           self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
    467           num_coeffs_to_reduce);
    468     }
    469     return false;
    470   }
    471 };
    472 }  // namespace internal
    473 }  // namespace TensorSycl
    474 
    475 namespace internal {
    476 
    477 template <typename Self, typename Op, bool Vectorizable>
    478 struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
    479   typedef typename Self::CoeffReturnType CoeffReturnType;
    480   typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
    481   static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
    482   static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
    483   static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
    484     typedef typename conditional<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType>::type OutType;
    485     static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
    486                     (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
    487                   "The Local thread size must be a power of 2 for the reduction "
    488                   "operation");
    489     EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
    490 
    491     typename Self::Index inputSize = self.impl().dimensions().TotalSize();
    492     // In this step we force the code not to be more than 2-step reduction:
    493     // Our empirical research shows that if each thread reduces at least 512
    494     // elemnts individually, we get better performance.
    495     const Index reductionPerThread = 2048;
    496     // const Index num_work_group =
    497     Index reductionGroup = dev.getPowerOfTwo(
    498         (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
    499     const Index num_work_group = std::min(reductionGroup, local_range);
    500     // 1
    501     // ? local_range
    502     // : 1);
    503     const Index global_range = num_work_group * local_range;
    504 
    505     auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
    506     typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
    507     if (num_work_group > 1) {
    508       CoeffReturnType *temp_pointer =
    509           static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
    510       typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
    511       dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
    512                                                                       local_range, inputSize, reducer);
    513 
    514       typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
    515                                                           EvaluatorPointerType, Index, local_range>
    516           GenericRKernel;
    517       dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
    518           tmp_global_accessor, data,
    519           cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
    520           reducer);
    521 
    522       dev.deallocate_temp(temp_pointer);
    523     } else {
    524       dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
    525                                                                       reducer);
    526     }
    527   }
    528 };
    529 // vectorizable inner_most most dim preserver
    530 // col reduction
    531 template <typename Self, typename Op>
    532 struct OuterReducer<Self, Op, Eigen::SyclDevice> {
    533   static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
    534 
    535   static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
    536                   typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
    537                   typename Self::Index num_coeffs_to_preserve) {
    538     return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
    539         Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
    540                                                                                  num_coeffs_to_reduce,
    541                                                                                  num_coeffs_to_preserve);
    542   }
    543 };
    544 // row reduction
    545 template <typename Self, typename Op>
    546 struct InnerReducer<Self, Op, Eigen::SyclDevice> {
    547   static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
    548 
    549   static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
    550                   typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
    551                   typename Self::Index num_coeffs_to_preserve) {
    552     return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
    553         Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
    554                                                                                  num_coeffs_to_reduce,
    555                                                                                  num_coeffs_to_preserve);
    556   }
    557 };
    558 
    559 // ArmgMax uses this kernel for partial reduction//
    560 // TODO(@mehdi.goli) come up with a better kernel
    561 // generic partial reduction
    562 template <typename Self, typename Op>
    563 struct GenericReducer<Self, Op, Eigen::SyclDevice> {
    564   static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
    565   static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
    566                   typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
    567                   typename Self::Index num_coeffs_to_preserve) {
    568     typename Self::Index range, GRange, tileSize;
    569     dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
    570 
    571     dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
    572                                        TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
    573         self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
    574         reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
    575     return false;
    576   }
    577 };
    578 
    579 }  // namespace internal
    580 }  // namespace Eigen
    581 
    582 #endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP