cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

TensorExecutor.h (26655B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
     11 #define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
     12 
     13 namespace Eigen {
     14 
     15 /**
     16  * \class TensorExecutor
     17  * \ingroup CXX11_Tensor_Module
     18  *
     19  * \brief The tensor executor class.
     20  *
     21  * This class is responsible for launch the evaluation of the expression on
     22  * the specified computing device.
     23  *
     24  * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
     25  *                      instructions)
     26  * @tparam Tiling       can use block based tensor evaluation
     27  *                      (see TensorBlock.h)
     28  */
     29 namespace internal {
     30 
     31 /**
     32  * Evaluating TensorBroadcastingOp via coefficient of packet path is extremely
     33  * expensive. If expression has at least one broadcast op in it, and it supports
     34  * block based evaluation, we always prefer it, even for the small tensors. For
     35  * all other tileable ops, block evaluation overhead for small tensors (fits
     36  * into L1) is too large, and we fallback on vectorized evaluation.
     37  */
     38 
     39 // TODO(ezhulenev): Add specializations for all other types of Tensor ops.
     40 
     41 template<typename Expression>
     42 struct ExpressionHasTensorBroadcastingOp {
     43   enum { value = false };
     44 };
     45 
     46 template<typename LhsXprType, typename RhsXprType>
     47 struct ExpressionHasTensorBroadcastingOp<
     48     const TensorAssignOp<LhsXprType, RhsXprType> > {
     49   enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
     50 };
     51 
     52 template<typename UnaryOp, typename XprType>
     53 struct ExpressionHasTensorBroadcastingOp<
     54     const TensorCwiseUnaryOp<UnaryOp, XprType> > {
     55   enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
     56 };
     57 
     58 template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
     59 struct ExpressionHasTensorBroadcastingOp<
     60     const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
     61   enum {
     62     value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
     63         ExpressionHasTensorBroadcastingOp<RhsXprType>::value
     64   };
     65 };
     66 
     67 template<typename Broadcast, typename XprType>
     68 struct ExpressionHasTensorBroadcastingOp<
     69     const TensorBroadcastingOp<Broadcast, XprType> > {
     70   enum { value = true };
     71 };
     72 
     73 // -------------------------------------------------------------------------- //
     74 
     75 /**
     76  * Default strategy: the expression is evaluated sequentially with a single cpu
     77  * thread, without vectorization and block evaluation.
     78  */
     79 template <typename Expression, typename Device, bool Vectorizable,
     80           TiledEvaluation Tiling>
     81 class TensorExecutor {
     82  public:
     83   typedef typename Expression::Index StorageIndex;
     84 
     85   // Including `unsupported/Eigen/CXX11/Tensor` in different translation units
     86   // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
     87   // violation. If this template is instantiated with a non-default device, it
     88   // means that this header file was included without defining
     89   // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
     90   static_assert(std::is_same<Device, DefaultDevice>::value,
     91                 "Default executor instantiated with non-default device. "
     92                 "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
     93                 "EIGEN_USE_SYCL before including Eigen headers.");
     94 
     95   EIGEN_DEVICE_FUNC
     96   static EIGEN_STRONG_INLINE void run(const Expression& expr,
     97                                       const Device& device = Device()) {
     98     TensorEvaluator<Expression, Device> evaluator(expr, device);
     99     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    100     if (needs_assign) {
    101       const StorageIndex size = array_prod(evaluator.dimensions());
    102       for (StorageIndex i = 0; i < size; ++i) {
    103         evaluator.evalScalar(i);
    104       }
    105     }
    106     evaluator.cleanup();
    107   }
    108 };
    109 
    110 /**
    111  * Default async execution strategy is not implemented. Currently it's only
    112  * available for ThreadPoolDevice (see definition below).
    113  */
    114 template <typename Expression, typename Device, typename DoneCallback,
    115           bool Vectorizable, TiledEvaluation Tiling>
    116 class TensorAsyncExecutor {};
    117 
    118 /**
    119  * Process all the data with a single cpu thread, using vectorized instructions.
    120  */
    121 template <typename Expression>
    122 class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
    123                      /*Tiling=*/TiledEvaluation::Off> {
    124  public:
    125   typedef typename Expression::Index StorageIndex;
    126 
    127   EIGEN_DEVICE_FUNC
    128   static EIGEN_STRONG_INLINE void run(
    129       const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    130     TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    131     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    132     if (needs_assign) {
    133       const StorageIndex size = array_prod(evaluator.dimensions());
    134       const int PacketSize = unpacket_traits<typename TensorEvaluator<
    135           Expression, DefaultDevice>::PacketReturnType>::size;
    136 
    137       // Give compiler a strong possibility to unroll the loop. But don't insist
    138       // on unrolling, because if the function is expensive compiler should not
    139       // unroll the loop at the expense of inlining.
    140       const StorageIndex UnrolledSize =
    141           (size / (4 * PacketSize)) * 4 * PacketSize;
    142       for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
    143         for (StorageIndex j = 0; j < 4; j++) {
    144           evaluator.evalPacket(i + j * PacketSize);
    145         }
    146       }
    147       const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
    148       for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
    149         evaluator.evalPacket(i);
    150       }
    151       for (StorageIndex i = VectorizedSize; i < size; ++i) {
    152         evaluator.evalScalar(i);
    153       }
    154     }
    155     evaluator.cleanup();
    156   }
    157 };
    158 
    159 /**
    160  * Process all the data with a single cpu thread, using blocks of data. By
    161  * sizing a block to fit L1 cache we get better cache performance.
    162  */
    163 template <typename Expression, bool Vectorizable>
    164 class TensorExecutor<Expression, DefaultDevice, Vectorizable,
    165                      /*Tiling=*/TiledEvaluation::On> {
    166  public:
    167   typedef typename traits<Expression>::Scalar Scalar;
    168   typedef typename remove_const<Scalar>::type ScalarNoConst;
    169 
    170   typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
    171   typedef typename traits<Expression>::Index StorageIndex;
    172 
    173   static const int NumDims = traits<Expression>::NumDimensions;
    174 
    175   EIGEN_DEVICE_FUNC
    176   static EIGEN_STRONG_INLINE void run(const Expression& expr,
    177                          const DefaultDevice& device = DefaultDevice()) {
    178     typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex>
    179         TensorBlockMapper;
    180 
    181     typedef internal::TensorBlockDescriptor<NumDims, StorageIndex>
    182         TensorBlockDesc;
    183     typedef internal::TensorBlockScratchAllocator<DefaultDevice>
    184         TensorBlockScratch;
    185 
    186     Evaluator evaluator(expr, device);
    187 
    188     // TODO(ezhulenev): Do not use tiling for small tensors?
    189     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    190 
    191     if (needs_assign) {
    192       // Query expression tree for desired block size/shape.
    193       const TensorBlockResourceRequirements requirements =
    194           evaluator.getResourceRequirements();
    195 
    196       const TensorBlockMapper block_mapper(
    197           typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
    198           requirements);
    199 
    200       // Share scratch memory allocator between all blocks.
    201       TensorBlockScratch scratch(device);
    202 
    203       const StorageIndex total_block_count = block_mapper.blockCount();
    204       for (StorageIndex i = 0; i < total_block_count; ++i) {
    205         TensorBlockDesc desc = block_mapper.blockDescriptor(i);
    206         evaluator.evalBlock(desc, scratch);
    207         scratch.reset();
    208       }
    209     }
    210     evaluator.cleanup();
    211   }
    212 };
    213 
    214 /**
    215  * Multicore strategy: the index space is partitioned and each partition is
    216  * executed on a single core.
    217  *
    218  * (1) TensorExecutor will submit work to the ThreadPoolDevice managed thread
    219  *     pool, and will block the caller thread until all tasks are finished.
    220  *
    221  * (2) TensorAsyncExecutor is a non-blocking version, that will submit work to
    222  *     the ThreadPoolDevice managed thread pool, and will return immediately.
    223  *     It will call 'done' callback after all tasks are finished.
    224  */
    225 #ifdef EIGEN_USE_THREADS
    226 
    227 template <typename TensorBlockMapper>
    228 struct TensorExecutorTilingContext {
    229   TensorExecutorTilingContext() = default;
    230   TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
    231                               const TensorOpCost& b_cost, size_t b_aligned_size)
    232       : block_mapper(b_mapper),
    233         cost(b_cost),
    234         aligned_blocksize(b_aligned_size) {}
    235 
    236   TensorBlockMapper block_mapper;  // navigate through blocks
    237   TensorOpCost cost;               // cost of computing a single block
    238   size_t aligned_blocksize;        // block size after memory alignment
    239 };
    240 
    241 // Computes a block evaluation parameters, and allocates temporary memory buffer
    242 // for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
    243 template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
    244 TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
    245     const Evaluator& evaluator) {
    246   // Query expression tree for desired block size/shape.
    247   TensorBlockResourceRequirements requirements =
    248       evaluator.getResourceRequirements();
    249 
    250   // Update target block size based on cost model.
    251   double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
    252       1, requirements.cost_per_coeff);
    253   requirements.size = static_cast<size_t>(1.0 / taskSize);
    254 
    255   TensorBlockMapper block_mapper(
    256       typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
    257       requirements);
    258 
    259   size_t block_size = block_mapper.blockTotalSize();
    260   const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
    261   const size_t aligned_blocksize =
    262       align *
    263       divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
    264 
    265   return {block_mapper, requirements.cost_per_coeff * block_size,
    266           aligned_blocksize};
    267 }
    268 
    269 template <typename Evaluator, typename StorageIndex, bool Vectorizable>
    270 struct EvalRange {
    271   static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
    272                   const StorageIndex lastIdx) {
    273     Evaluator evaluator = *evaluator_in;
    274     eigen_assert(lastIdx >= firstIdx);
    275     for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
    276       evaluator.evalScalar(i);
    277     }
    278   }
    279 
    280   static StorageIndex alignBlockSize(StorageIndex size) { return size; }
    281 };
    282 
    283 template <typename Evaluator, typename StorageIndex>
    284 struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
    285   static const int PacketSize =
    286       unpacket_traits<typename Evaluator::PacketReturnType>::size;
    287 
    288   static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
    289                   const StorageIndex lastIdx) {
    290     Evaluator evaluator = *evaluator_in;
    291     eigen_assert(lastIdx >= firstIdx);
    292     StorageIndex i = firstIdx;
    293     if (lastIdx - firstIdx >= PacketSize) {
    294       eigen_assert(firstIdx % PacketSize == 0);
    295       StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
    296       // Give compiler a strong possibility to unroll the loop. But don't insist
    297       // on unrolling, because if the function is expensive compiler should not
    298       // unroll the loop at the expense of inlining.
    299       for (; i <= last_chunk_offset; i += 4 * PacketSize) {
    300         for (StorageIndex j = 0; j < 4; j++) {
    301           evaluator.evalPacket(i + j * PacketSize);
    302         }
    303       }
    304       last_chunk_offset = lastIdx - PacketSize;
    305       for (; i <= last_chunk_offset; i += PacketSize) {
    306         evaluator.evalPacket(i);
    307       }
    308     }
    309     for (; i < lastIdx; ++i) {
    310       evaluator.evalScalar(i);
    311     }
    312   }
    313 
    314   static StorageIndex alignBlockSize(StorageIndex size) {
    315     // Align block size to packet size and account for unrolling in run above.
    316     if (size >= 16 * PacketSize) {
    317       return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
    318     }
    319     // Aligning to 4 * PacketSize would increase block size by more than 25%.
    320     return (size + PacketSize - 1) & ~(PacketSize - 1);
    321   }
    322 };
    323 
    324 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
    325 class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
    326  public:
    327   typedef typename Expression::Index StorageIndex;
    328 
    329   static EIGEN_STRONG_INLINE void run(const Expression& expr,
    330                          const ThreadPoolDevice& device) {
    331     typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    332     typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
    333 
    334     Evaluator evaluator(expr, device);
    335     const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    336     if (needs_assign) {
    337       const StorageIndex size = array_prod(evaluator.dimensions());
    338       device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
    339                          EvalRange::alignBlockSize,
    340                          [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
    341                            EvalRange::run(&evaluator, firstIdx, lastIdx);
    342                          });
    343     }
    344     evaluator.cleanup();
    345   }
    346 };
    347 
    348 template <typename Expression, bool Vectorizable>
    349 class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
    350                      /*Tiling=*/TiledEvaluation::On> {
    351  public:
    352   typedef typename traits<Expression>::Index IndexType;
    353   typedef typename traits<Expression>::Scalar Scalar;
    354   typedef typename remove_const<Scalar>::type ScalarNoConst;
    355 
    356   static const int NumDims = traits<Expression>::NumDimensions;
    357 
    358   typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    359   typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
    360   typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
    361 
    362   typedef internal::TensorBlockDescriptor<NumDims, IndexType>
    363       TensorBlockDesc;
    364   typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
    365       TensorBlockScratch;
    366 
    367   static EIGEN_STRONG_INLINE void run(const Expression& expr,
    368                                       const ThreadPoolDevice& device) {
    369     Evaluator evaluator(expr, device);
    370 
    371     const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    372     if (needs_assign) {
    373       const TilingContext tiling =
    374           internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
    375                                                    Vectorizable>(evaluator);
    376 
    377       auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
    378                                                        IndexType lastBlockIdx) {
    379         TensorBlockScratch scratch(device);
    380 
    381         for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
    382              ++block_idx) {
    383           TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
    384           evaluator.evalBlock(desc, scratch);
    385           scratch.reset();
    386         }
    387       };
    388 
    389       // Evaluate small expressions directly as a single block.
    390       if (tiling.block_mapper.blockCount() == 1) {
    391         TensorBlockScratch scratch(device);
    392         TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
    393         evaluator.evalBlock(desc, scratch);
    394       } else {
    395         device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
    396                            eval_block);
    397       }
    398     }
    399     evaluator.cleanup();
    400   }
    401 };
    402 
    403 template <typename Expression, typename DoneCallback, bool Vectorizable,
    404           TiledEvaluation Tiling>
    405 class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
    406                           Vectorizable, Tiling> {
    407  public:
    408   typedef typename Expression::Index StorageIndex;
    409   typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    410 
    411   static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
    412                                            const ThreadPoolDevice& device,
    413                                            DoneCallback done) {
    414     TensorAsyncExecutorContext* const ctx =
    415         new TensorAsyncExecutorContext(expr, device, std::move(done));
    416 
    417     const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
    418       if (!need_assign) {
    419         delete ctx;
    420         return;
    421       }
    422 
    423       typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
    424       const StorageIndex size = array_prod(ctx->evaluator.dimensions());
    425       device.parallelForAsync(
    426           size, ctx->evaluator.costPerCoeff(Vectorizable),
    427           EvalRange::alignBlockSize,
    428           [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
    429             EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
    430           },
    431           [ctx]() { delete ctx; });
    432     };
    433 
    434     ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
    435   }
    436 
    437  private:
    438   struct TensorAsyncExecutorContext {
    439     TensorAsyncExecutorContext(const Expression& expr,
    440                                const ThreadPoolDevice& thread_pool,
    441                                DoneCallback done)
    442         : evaluator(expr, thread_pool), on_done(std::move(done)) {}
    443 
    444     ~TensorAsyncExecutorContext() {
    445       evaluator.cleanup();
    446       on_done();
    447     }
    448 
    449     Evaluator evaluator;
    450 
    451    private:
    452     DoneCallback on_done;
    453   };
    454 };
    455 
    456 template <typename Expression, typename DoneCallback, bool Vectorizable>
    457 class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
    458                           Vectorizable, /*Tileable*/ TiledEvaluation::On> {
    459  public:
    460   typedef typename traits<Expression>::Index IndexType;
    461   typedef typename traits<Expression>::Scalar Scalar;
    462   typedef typename remove_const<Scalar>::type ScalarNoConst;
    463 
    464   static const int NumDims = traits<Expression>::NumDimensions;
    465 
    466   typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    467   typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
    468   typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
    469 
    470   typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
    471   typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
    472       TensorBlockScratch;
    473 
    474   static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
    475                                            const ThreadPoolDevice& device,
    476                                            DoneCallback done) {
    477 
    478     TensorAsyncExecutorContext* const ctx =
    479         new TensorAsyncExecutorContext(expr, device, std::move(done));
    480 
    481     const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
    482       if (!need_assign) {
    483         delete ctx;
    484         return;
    485       }
    486 
    487       ctx->tiling = internal::GetTensorExecutorTilingContext<
    488           Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);
    489 
    490       auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
    491         TensorBlockScratch scratch(ctx->device);
    492 
    493         for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
    494              ++block_idx) {
    495           TensorBlockDesc desc =
    496               ctx->tiling.block_mapper.blockDescriptor(block_idx);
    497           ctx->evaluator.evalBlock(desc, scratch);
    498           scratch.reset();
    499         }
    500       };
    501 
    502       // Evaluate small expressions directly as a single block.
    503       if (ctx->tiling.block_mapper.blockCount() == 1) {
    504         TensorBlockScratch scratch(ctx->device);
    505         TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
    506         ctx->evaluator.evalBlock(desc, scratch);
    507         delete ctx;
    508       } else {
    509         ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
    510                                      ctx->tiling.cost, eval_block,
    511                                      [ctx]() { delete ctx; });
    512       }
    513     };
    514 
    515     ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
    516   }
    517 
    518  private:
    519   struct TensorAsyncExecutorContext {
    520     TensorAsyncExecutorContext(const Expression& expr,
    521                                const ThreadPoolDevice& thread_pool,
    522                                DoneCallback done)
    523         : device(thread_pool),
    524           evaluator(expr, thread_pool),
    525           on_done(std::move(done)) {}
    526 
    527     ~TensorAsyncExecutorContext() {
    528       evaluator.cleanup();
    529       on_done();
    530     }
    531 
    532     const ThreadPoolDevice& device;
    533     Evaluator evaluator;
    534     TilingContext tiling;
    535 
    536    private:
    537     DoneCallback on_done;
    538   };
    539 };
    540 
    541 #endif  // EIGEN_USE_THREADS
    542 
    543 // GPU: the evaluation of the expression is offloaded to a GPU.
    544 #if defined(EIGEN_USE_GPU)
    545 
    546 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
    547 class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
    548  public:
    549   typedef typename Expression::Index StorageIndex;
    550   static void run(const Expression& expr, const GpuDevice& device);
    551 };
    552 
    553 #if defined(EIGEN_GPUCC)
    554 template <typename Evaluator, typename StorageIndex, bool Vectorizable>
    555 struct EigenMetaKernelEval {
    556   static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
    557   void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
    558     for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
    559       eval.evalScalar(i);
    560     }
    561   }
    562 };
    563 
    564 template <typename Evaluator, typename StorageIndex>
    565 struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
    566   static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
    567   void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
    568     const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
    569     const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
    570     const StorageIndex vectorized_step_size = step_size * PacketSize;
    571 
    572     // Use the vector path
    573     for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
    574          i += vectorized_step_size) {
    575       eval.evalPacket(i);
    576     }
    577     for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) {
    578       eval.evalScalar(i);
    579     }
    580   }
    581 };
    582 
    583 template <typename Evaluator, typename StorageIndex>
    584 __global__ void
    585 __launch_bounds__(1024)
    586 EigenMetaKernel(Evaluator eval, StorageIndex size) {
    587 
    588   const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
    589   const StorageIndex step_size = blockDim.x * gridDim.x;
    590 
    591   const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
    592   EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
    593 }
    594 
    595 /*static*/
    596 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
    597 EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
    598     const Expression& expr, const GpuDevice& device) {
    599   TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
    600   const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    601   if (needs_assign) {
    602 
    603     const int block_size = device.maxGpuThreadsPerBlock();
    604     const int max_blocks = device.getNumGpuMultiProcessors() *
    605                            device.maxGpuThreadsPerMultiProcessor() / block_size;
    606     const StorageIndex size = array_prod(evaluator.dimensions());
    607     // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
    608     const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
    609 
    610     LAUNCH_GPU_KERNEL(
    611         (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
    612         num_blocks, block_size, 0, device, evaluator, size);
    613   }
    614   evaluator.cleanup();
    615 }
    616 
    617 #endif  // EIGEN_GPUCC
    618 #endif  // EIGEN_USE_GPU
    619 
    620 // SYCL Executor policy
    621 #ifdef EIGEN_USE_SYCL
    622 
    623 template <typename Evaluator>
    624 struct ExecExprFunctorKernel {
    625   typedef typename Evaluator::Index Index;
    626   Evaluator evaluator;
    627   const Index range;
    628   template <typename Scratch>
    629   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
    630       const Scratch, Evaluator evaluator_, const Index range_)
    631       : evaluator(evaluator_), range(range_) {}
    632 
    633   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
    634       cl::sycl::nd_item<1> itemID) {
    635     compute(itemID);
    636   }
    637   template <bool is_vec = Evaluator::PacketAccess>
    638   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<!is_vec>::type
    639   compute(const cl::sycl::nd_item<1>& itemID) {
    640     Index gId = static_cast<Index>(itemID.get_global_linear_id());
    641     Index total_threads = itemID.get_global_range(0);
    642 
    643     for (Index i = gId; i < range; i += total_threads) {
    644       evaluator.evalScalar(i);
    645     }
    646   }
    647   template <bool is_vec = Evaluator::PacketAccess>
    648   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<is_vec>::type
    649   compute(const cl::sycl::nd_item<1>& itemID) {
    650     const Index vectorizedRange =
    651         (range / Evaluator::PacketSize) * Evaluator::PacketSize;
    652     Index gId = static_cast<Index>(itemID.get_global_linear_id());
    653     const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    654     const Index start = Evaluator::PacketSize * gId;
    655     for (Index i = start; i < vectorizedRange; i += step) {
    656       evaluator.evalPacket(i);
    657     }
    658     gId += vectorizedRange;
    659     for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
    660       evaluator.evalScalar(i);
    661     }
    662   }
    663 };
    664 
    665 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
    666 class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
    667  public:
    668   typedef typename Expression::Index Index;
    669   static EIGEN_STRONG_INLINE void run(const Expression& expr,
    670                                       const Eigen::SyclDevice& dev) {
    671     typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
    672     Evaluator evaluator(expr, dev);
    673     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    674     if (needs_assign) {
    675       Index range, GRange, tileSize;
    676       Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
    677       total_size = (total_size == 0) ? 1 : total_size;
    678       const int PacketSize =
    679           Eigen::PacketType<typename Evaluator::CoeffReturnType,
    680                             Eigen::SyclDevice>::size;
    681       Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
    682       dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
    683       range = total_size;
    684 
    685       dev.template nullary_kernel_launcher<
    686           typename Evaluator::CoeffReturnType,
    687           ExecExprFunctorKernel<Evaluator> >(
    688           evaluator,
    689           cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
    690                                 cl::sycl::range<1>(tileSize)),
    691           Index(1), range);
    692     }
    693     evaluator.cleanup();
    694   }
    695 };
    696 
    697 #endif
    698 
    699 } // end namespace internal
    700 
    701 } // end namespace Eigen
    702 
    703 #endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H