cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

TensorConvolutionSycl.h (27527B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Mehdi Goli    Codeplay Software Ltd.
      5 // Ralph Potter  Codeplay Software Ltd.
      6 // Luke Iwanski  Codeplay Software Ltd.
      7 // Contact: <eigen@codeplay.com>
      8 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
      9 
     10 //
     11 // This Source Code Form is subject to the terms of the Mozilla
     12 // Public License v. 2.0. If a copy of the MPL was not distributed
     13 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
     14 
     15 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
     16 #define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
     17 
     18 namespace Eigen {
     19 
     20 /** \class TensorConvolution
     21  * \ingroup CXX11_Tensor_Module
     22  *
     23  * \brief Tensor convolution class.
     24  *
     25  *
     26  */
     27 
     28 enum class convolution_type { CONV1D, CONV2D, CONV3D };
     29 template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
     30           typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
     31 struct EigenConvolutionKernel;
     32 template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
     33           typename Kernel_accessor, typename Buffer_accessor>
     34 struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
     35                               Buffer_accessor, convolution_type::CONV1D> {
     36   typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
     37       Local_accessor;
     38   Local_accessor local_acc;
     39   Evaluator device_evaluator;
     40   Kernel_accessor kernel_filter;
     41   Buffer_accessor buffer_acc;
     42   internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
     43   const size_t kernelSize;
     44   const cl::sycl::range<2> input_range;
     45   EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
     46                          Buffer_accessor buffer_acc_,
     47                          internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
     48                          const size_t kernelSize_, const cl::sycl::range<2> input_range_)
     49       : local_acc(local_acc_),
     50         device_evaluator(device_evaluator_),
     51         kernel_filter(kernel_filter_),
     52         buffer_acc(buffer_acc_),
     53         indexMapper(indexMapper_),
     54         kernelSize(kernelSize_),
     55         input_range(input_range_) {}
     56 
     57   template <typename BooleanDim2>
     58   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
     59     return (boolean_check[0] && boolean_check[1]);
     60   }
     61   void operator()(cl::sycl::nd_item<2> itemID) {
     62     auto buffer_ptr = buffer_acc.get_pointer();
     63     auto kernel_ptr = kernel_filter.get_pointer();
     64     // the required row to be calculated for the for each plane in shered memory
     65     const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
     66     const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
     67     const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
     68     const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
     69     /// fill the shared memory
     70     for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
     71       const size_t local_index = i + plane_kernel_offset;
     72       const size_t tensor_index =
     73           plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);
     74 
     75       local_acc[local_index] =
     76           (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
     77               ? device_evaluator.coeff(tensor_index)
     78               : CoeffReturnType(0);
     79     }
     80 
     81     itemID.barrier(cl::sycl::access::fence_space::local_space);
     82 
     83     // calculate the convolution // output start x
     84     const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
     85     if (boundary_check(itemID.get_global_id() < input_range)) {
     86       CoeffReturnType result = static_cast<CoeffReturnType>(0);
     87       const size_t index = plane_kernel_offset + itemID.get_local_id(0);
     88       for (size_t k = 0; k < kernelSize; ++k) {
     89         result += (local_acc[k + index] * kernel_ptr[k]);
     90       }
     91       const size_t tensor_index =
     92           indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
     93           indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
     94       buffer_ptr[tensor_index] = result;
     95     }
     96   }
     97 };
     98 
     99 template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
    100           typename Kernel_accessor, typename Buffer_accessor>
    101 struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
    102                               Buffer_accessor, convolution_type::CONV2D> {
    103   typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
    104       Local_accessor;
    105   Local_accessor local_acc;
    106   Evaluator device_evaluator;
    107   Kernel_accessor kernel_filter;
    108   Buffer_accessor buffer_acc;
    109   internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
    110   const cl::sycl::range<2> kernel_size;
    111   const cl::sycl::range<3> input_range;
    112   EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
    113                          Buffer_accessor buffer_acc_,
    114                          internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
    115                          const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
    116       : local_acc(local_acc_),
    117         device_evaluator(device_evaluator_),
    118         kernel_filter(kernel_filter_),
    119         buffer_acc(buffer_acc_),
    120         indexMapper(indexMapper_),
    121         kernel_size(kernel_size_),
    122         input_range(input_range_) {}
    123   template <typename BooleanDim3>
    124   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
    125     return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
    126   }
    127 
    128   void operator()(cl::sycl::nd_item<3> itemID) {
    129     auto buffer_ptr = buffer_acc.get_pointer();
    130     auto kernel_ptr = kernel_filter.get_pointer();
    131     // the required row to be calculated for the for each plane in shered memory
    132     const auto num_input = cl::sycl::range<2>{
    133         (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};
    134 
    135     const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
    136     const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];
    137 
    138     const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
    139                                                  itemID.get_group(1) * itemID.get_local_range()[1]};
    140       
    141     // fill the local memory
    142     bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
    143     for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
    144       const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
    145       bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1)); 
    146       for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
    147         const size_t local_index = i + local_input_offset;
    148         const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
    149                                                              i + input_offset[0], j + input_offset[1]);
    150         local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) &&
    151                                   in_range_dim1 && in_range_dim2)
    152                                      ? device_evaluator.coeff(tensor_index)
    153                                      : CoeffReturnType(0);
    154       }
    155     }
    156 
    157     itemID.barrier(cl::sycl::access::fence_space::local_space);
    158 
    159     // output offset start for each thread
    160     const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
    161                                                   itemID.get_group(1) * itemID.get_local_range()[1]};
    162 
    163     if (boundary_check(itemID.get_global_id() < input_range)) {
    164       CoeffReturnType result = static_cast<CoeffReturnType>(0);
    165 
    166       for (size_t j = 0; j < kernel_size[1]; j++) {
    167         size_t kernel_offset = kernel_size[0] * j;
    168         const size_t index =
    169             (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
    170         for (size_t i = 0; i < kernel_size[0]; i++) {
    171           result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
    172         }
    173       }
    174       const size_t tensor_index =
    175           indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
    176           indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
    177                                                              itemID.get_local_id(1) + output_offset[1]);
    178 
    179       buffer_ptr[tensor_index] = result;
    180     }
    181   }
    182 };
    183 
    184 template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
    185           typename Kernel_accessor, typename Buffer_accessor>
    186 struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
    187                               Buffer_accessor, convolution_type::CONV3D> {
    188   typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
    189       Local_accessor;
    190   Local_accessor local_acc;
    191   Evaluator device_evaluator;
    192   Kernel_accessor kernel_filter;
    193   Buffer_accessor buffer_acc;
    194   internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
    195   const cl::sycl::range<3> kernel_size;
    196   const cl::sycl::range<3> input_range;
    197   const size_t numP;
    198 
    199   EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
    200                          Buffer_accessor buffer_acc_,
    201                          internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
    202                          const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
    203                          const size_t numP_)
    204       : local_acc(local_acc_),
    205         device_evaluator(device_evaluator_),
    206         kernel_filter(kernel_filter_),
    207         buffer_acc(buffer_acc_),
    208         indexMapper(indexMapper_),
    209         kernel_size(kernel_size_),
    210         input_range(input_range_),
    211         numP(numP_) {}
    212   template <typename BooleanDim3>
    213   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
    214     return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
    215   }
    216   void operator()(cl::sycl::nd_item<3> itemID) {
    217     auto buffer_ptr = buffer_acc.get_pointer();
    218     auto kernel_ptr = kernel_filter.get_pointer();
    219     const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
    220 
    221     const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};
    222 
    223     const auto output_offset =
    224           cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};
    225 
    226     for (size_t p = 0; p < numP; p++) {
    227       /// fill the shared memory
    228       const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    229       for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
    230         size_t local_index_dim2 = num_input[0] * num_input[1] * k;
    231         bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
    232         for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
    233           bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
    234           size_t local_index_dim1 = (num_input[0] * j)  + local_index_dim2;
    235           for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
    236             bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
    237             const size_t local_index = local_index_dim1 + i;
    238             const size_t tensor_index =
    239                 plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
    240                                          i + input_offset[0], j + input_offset[1], k + input_offset[2]);
    241             local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
    242           }
    243         }
    244       }
    245       itemID.barrier(cl::sycl::access::fence_space::local_space);
    246 
    247       // calculate the convolution
    248 
    249       if (boundary_check(itemID.get_global_id() < input_range)) {
    250         CoeffReturnType result = static_cast<CoeffReturnType>(0);
    251         for (size_t k = 0; k < kernel_size[2]; k++) {
    252           for (size_t j = 0; j < kernel_size[1]; j++) {
    253             for (size_t i = 0; i < kernel_size[0]; i++) {
    254               const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
    255               const size_t local_index =
    256                   ((i + itemID.get_local_id(0)) +
    257                    num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));
    258 
    259               result += (local_acc[local_index] * kernel_ptr[kernel_index]);
    260             }
    261           }
    262         }
    263         const size_t tensor_index =
    264             indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
    265             indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
    266         buffer_ptr[tensor_index] = result;
    267       }
    268 
    269       itemID.barrier(cl::sycl::access::fence_space::local_space);
    270     }
    271   }
    272 };
    273 
    274 template <typename Indices, typename InputArgType, typename KernelArgType>
    275 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
    276   typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
    277 
    278   static const int NumDims =
    279       internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
    280   static const int NumKernelDims = internal::array_size<Indices>::value;
    281   typedef typename XprType::Index Index;
    282   typedef DSizes<Index, NumDims> Dimensions;
    283   typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
    284   typedef const Eigen::SyclDevice Device;
    285   typedef typename XprType::CoeffReturnType CoeffReturnType;
    286   typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
    287   typedef typename InputArgType::Scalar Scalar;
    288   static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
    289   typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
    290   typedef typename Storage::Type EvaluatorPointerType;
    291   typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;
    292 
    293   enum {
    294     IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
    295                 TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
    296     PacketAccess = false,
    297     BlockAccess = false,
    298     PreferBlockAccess = false,
    299     Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout,
    300     CoordAccess = false,  // to be implemented
    301     RawAccess = false
    302   };
    303 
    304   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
    305   typedef internal::TensorBlockNotImplemented TensorBlock;
    306   //===--------------------------------------------------------------------===//
    307 
    308   TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
    309       : m_inputImpl(op.inputExpression(), device),
    310         m_kernelArg(op.kernelExpression()),
    311         m_kernelImpl(op.kernelExpression(), device),
    312         m_indices(op.indices()),
    313         m_buf(NULL),
    314         m_kernel(NULL),
    315         m_local_kernel(false),
    316         m_device(device) {
    317     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
    318                          static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
    319                         YOU_MADE_A_PROGRAMMING_MISTAKE);
    320 
    321     const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
    322     const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
    323         m_kernelImpl.dimensions();
    324 
    325     m_dimensions = m_inputImpl.dimensions();
    326     for (int i = 0; i < NumKernelDims; ++i) {
    327       const Index index = op.indices()[i];
    328       const Index input_dim = input_dims[index];
    329       const Index kernel_dim = kernel_dims[i];
    330       const Index result_dim = input_dim - kernel_dim + 1;
    331       m_dimensions[index] = result_dim;
    332     }
    333   }
    334 
    335   EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }
    336 
    337   EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    338     preloadKernel();
    339     m_inputImpl.evalSubExprsIfNeeded(NULL);
    340     if (data) {
    341       executeEval(data);
    342       return false;
    343     } else {
    344       m_buf = (EvaluatorPointerType)m_device.get(
    345           (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
    346       executeEval(m_buf);
    347       return true;
    348     }
    349   }
    350 
    351   EIGEN_STRONG_INLINE void cleanup() {
    352     m_inputImpl.cleanup();
    353     if (m_buf) {
    354       m_device.deallocate_temp(m_buf);
    355       m_buf = NULL;
    356     }
    357     if (m_local_kernel) {
    358       m_device.deallocate_temp(m_kernel);
    359       m_local_kernel = false;
    360     }
    361     m_kernel = NULL;
    362   }
    363   /// used by sycl in order to build the sycl buffer
    364   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }
    365   /// used by sycl in order to build the sycl buffer
    366   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }
    367 
    368   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
    369     // Don't make a local copy of the kernel unless we have to (i.e. it's an
    370     // expression that needs to be evaluated)
    371     typename KernelStorage::Type in_place = m_kernelImpl.data();
    372     if (in_place) {
    373       m_kernel = in_place;
    374       m_local_kernel = false;
    375     } else {
    376       ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
    377       EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
    378       typedef TensorEvalToOp<const KernelArgType> EvalTo;
    379       EvalTo evalToTmp(m_device.get(local), m_kernelArg);
    380       const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
    381       internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
    382       m_kernel = local;
    383       m_local_kernel = true;
    384     }
    385   }
    386 
    387   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
    388     typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
    389     typedef typename InputEvaluator::Dimensions InputDims;
    390     switch (NumKernelDims) {
    391       case 1: {
    392         const size_t numX = dimensions()[m_indices[0]];
    393         const size_t numP = dimensions().TotalSize() / numX;
    394         const auto input_dim = std::array<size_t, 2>{numX, numP};
    395         auto global_range = cl::sycl::range<2>{};
    396         auto local_range = cl::sycl::range<2>{};
    397         const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
    398 
    399         m_device.parallel_for_setup(input_dim, global_range, local_range);
    400         const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
    401         gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
    402         const array<Index, 1> indices{{m_indices[0]}};
    403         const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
    404         internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
    405 
    406         typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
    407                                        typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
    408             ConvKernel;
    409 
    410         m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
    411             m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
    412             indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
    413         break;
    414       }
    415 
    416       case 2: {
    417         auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
    418                                                   static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
    419         auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
    420                                               (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
    421         const size_t numX = dimensions()[m_indices[kernel_index[0]]];
    422         const size_t numY = dimensions()[m_indices[kernel_index[1]]];
    423         const size_t numP = dimensions().TotalSize() / (numX * numY);
    424         auto input_dim = std::array<size_t, 3>{numX, numY, numP};
    425 
    426         auto global_range = cl::sycl::range<3>{};
    427         auto local_range = cl::sycl::range<3>{};
    428 
    429         m_device.parallel_for_setup(input_dim, global_range, local_range);
    430 
    431         const size_t local_memory_size =
    432             (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
    433         gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
    434         const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
    435         const array<Index, 2> kernel_dims{
    436             {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
    437         internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
    438         typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
    439                                        typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
    440             ConvKernel;
    441         m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
    442             m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
    443             indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
    444         break;
    445       }
    446 
    447       case 3: {
    448         auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
    449                                                   static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
    450                                                   static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};
    451 
    452         auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
    453                                               (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
    454                                               (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};
    455 
    456         const size_t numX = dimensions()[m_indices[kernel_index[0]]];
    457         const size_t numY = dimensions()[m_indices[kernel_index[1]]];
    458         const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
    459         auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
    460         const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);
    461 
    462         const array<Index, 3> indices{
    463             {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
    464         const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
    465                                            m_kernelImpl.dimensions()[kernel_index[1]],
    466                                            m_kernelImpl.dimensions()[kernel_index[2]]}};
    467 
    468         internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
    469 
    470         auto global_range = cl::sycl::range<3>{};
    471         auto local_range = cl::sycl::range<3>{};
    472 
    473         m_device.parallel_for_setup(input_dim, global_range, local_range);
    474         auto local_memory_range = (local_range + kernel_size - 1);
    475         const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];
    476 
    477         gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
    478         typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
    479                                        typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
    480             ConvKernel;
    481         m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
    482             m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
    483             indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
    484         break;
    485       }
    486 
    487       default: {
    488         EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
    489                             THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
    490       }
    491     }
    492   }
    493 
    494   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    495     eigen_assert(m_buf != NULL);
    496     eigen_assert(index < m_dimensions.TotalSize());
    497     return m_buf[index];
    498   }
    499 
    500   template <int LoadMode>
    501   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
    502     eigen_assert(m_buf != NULL);
    503     eigen_assert(index < m_dimensions.TotalSize());
    504     return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
    505   }
    506 
    507   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    508     // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
    509     // model.
    510     const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    511     // We ignore the use of fused multiply-add.
    512     const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    513     const double firstIndex_compute_cost =
    514         NumDims *
    515         (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
    516     return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
    517            kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
    518                           TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
    519   }
    520   // binding placeholder accessors to a command group handler for SYCL
    521   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    522     m_kernelImpl.bind(cgh);
    523     m_inputImpl.bind(cgh);
    524     m_buf.bind(cgh);
    525     m_kernel.bind(cgh);
    526   }
    527 
    528  private:
    529   // No assignment (copies are needed by the kernels)
    530   TensorEvaluator &operator=(const TensorEvaluator &);
    531   TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
    532   KernelArgType m_kernelArg;
    533   TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
    534   Indices m_indices;
    535   Dimensions m_dimensions;
    536   EvaluatorPointerType m_buf;
    537   typename KernelStorage::Type m_kernel;
    538   bool m_local_kernel;
    539   const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
    540 };  // namespace Eigen
    541 
    542 }  // end namespace Eigen
    543 
    544 #endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H