cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

TensorConvolution.h (48686B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
     11 #define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
     12 
     13 namespace Eigen {
     14 
     15 /** \class TensorConvolution
     16   * \ingroup CXX11_Tensor_Module
     17   *
     18   * \brief Tensor convolution class.
     19   *
     20   *
     21   */
     22 namespace internal {
     23 
     24 template <typename Index, typename InputDims, int NumKernelDims, int Layout>
     25 class IndexMapper {
     26  public:
     27   IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
     28               const array<Index, NumKernelDims>& indices) {
     29 
     30     array<Index, NumDims> dimensions = input_dims;
     31     for (int i = 0; i < NumKernelDims; ++i) {
     32       const Index index = indices[i];
     33       const Index input_dim = input_dims[index];
     34       const Index kernel_dim = kernel_dims[i];
     35       const Index result_dim = input_dim - kernel_dim + 1;
     36       dimensions[index] = result_dim;
     37     }
     38 
     39     array<Index, NumDims> inputStrides;
     40     array<Index, NumDims> outputStrides;
     41     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
     42       inputStrides[0] = 1;
     43       outputStrides[0] = 1;
     44       for (int i = 1; i < NumDims; ++i) {
     45         inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
     46         outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
     47       }
     48     } else {
     49       inputStrides[NumDims - 1] = 1;
     50       outputStrides[NumDims - 1] = 1;
     51       for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
     52         inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
     53         outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
     54       }
     55     }
     56 
     57     array<Index, NumDims> gpuInputDimensions;
     58     array<Index, NumDims> gpuOutputDimensions;
     59     array<Index, NumDims> tmp = dimensions;
     60     array<Index, NumDims> ordering;
     61     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
     62                               ? 0
     63                               : NumDims - NumKernelDims;
     64     for (int i = 0; i < NumKernelDims; ++i) {
     65       const Index index = i + offset;
     66       ordering[index] = indices[i];
     67       tmp[indices[i]] = -1;
     68       gpuInputDimensions[index] = input_dims[indices[i]];
     69       gpuOutputDimensions[index] = dimensions[indices[i]];
     70     }
     71 
     72     int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
     73                       ? NumKernelDims
     74                       : 0;
     75     for (int i = 0; i < NumDims; ++i) {
     76       if (tmp[i] >= 0) {
     77         ordering[written] = i;
     78         gpuInputDimensions[written] = input_dims[i];
     79         gpuOutputDimensions[written] = dimensions[i];
     80         ++written;
     81       }
     82     }
     83 
     84     for (int i = 0; i < NumDims; ++i) {
     85       m_inputStrides[i] = inputStrides[ordering[i]];
     86       m_outputStrides[i] = outputStrides[ordering[i]];
     87     }
     88 
     89     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
     90       for (int i = 0; i < NumDims; ++i) {
     91         if (i > NumKernelDims) {
     92           m_gpuInputStrides[i] =
     93               m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
     94           m_gpuOutputStrides[i] =
     95               m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
     96         } else {
     97           m_gpuInputStrides[i] = 1;
     98           m_gpuOutputStrides[i] = 1;
     99         }
    100       }
    101     } else {
    102       for (int i = NumDims - 1; i >= 0; --i) {
    103         if (static_cast<size_t>(i + 1) < offset) {
    104           m_gpuInputStrides[i] =
    105               m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
    106           m_gpuOutputStrides[i] =
    107               m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
    108         } else {
    109           m_gpuInputStrides[i] = 1;
    110           m_gpuOutputStrides[i] = 1;
    111         }
    112       }
    113     }
    114   }
    115 
    116   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
    117     Index inputIndex = 0;
    118     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    119       for (int d = NumDims - 1; d > NumKernelDims; --d) {
    120         const Index idx = p / m_gpuInputStrides[d];
    121         inputIndex += idx * m_inputStrides[d];
    122         p -= idx * m_gpuInputStrides[d];
    123       }
    124       inputIndex += p * m_inputStrides[NumKernelDims];
    125     } else {
    126       std::ptrdiff_t limit = 0;
    127       if (NumKernelDims < NumDims) {
    128         limit = NumDims - NumKernelDims - 1;
    129       }
    130       for (int d = 0; d < limit; ++d) {
    131         const Index idx = p / m_gpuInputStrides[d];
    132         inputIndex += idx * m_inputStrides[d];
    133         p -= idx * m_gpuInputStrides[d];
    134       }
    135       inputIndex += p * m_inputStrides[limit];
    136     }
    137     return inputIndex;
    138   }
    139 
    140   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
    141     Index outputIndex = 0;
    142     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    143       for (int d = NumDims - 1; d > NumKernelDims; --d) {
    144         const Index idx = p / m_gpuOutputStrides[d];
    145         outputIndex += idx * m_outputStrides[d];
    146         p -= idx * m_gpuOutputStrides[d];
    147       }
    148       outputIndex += p * m_outputStrides[NumKernelDims];
    149     } else {
    150       std::ptrdiff_t limit = 0;
    151       if (NumKernelDims < NumDims) {
    152         limit = NumDims - NumKernelDims - 1;
    153       }
    154       for (int d = 0; d < limit; ++d) {
    155         const Index idx = p / m_gpuOutputStrides[d];
    156         outputIndex += idx * m_outputStrides[d];
    157         p -= idx * m_gpuOutputStrides[d];
    158       }
    159       outputIndex += p * m_outputStrides[limit];
    160     }
    161     return outputIndex;
    162   }
    163 
    164   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
    165     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
    166                               ? 0
    167                               : NumDims - NumKernelDims;
    168     return i * m_inputStrides[offset];
    169   }
    170 
    171   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
    172     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
    173                               ? 0
    174                               : NumDims - NumKernelDims;
    175     return i * m_outputStrides[offset];
    176   }
    177 
    178   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
    179     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
    180                               ? 0
    181                               : NumDims - NumKernelDims;
    182     return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
    183   }
    184 
    185   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
    186     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
    187                               ? 0
    188                               : NumDims - NumKernelDims;
    189     return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
    190   }
    191 
    192   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
    193     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
    194                               ? 0
    195                               : NumDims - NumKernelDims;
    196     return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
    197            k * m_inputStrides[offset + 2];
    198   }
    199 
    200   EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
    201     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
    202                               ? 0
    203                               : NumDims - NumKernelDims;
    204     return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
    205            k * m_outputStrides[offset + 2];
    206   }
    207 
    208  private:
    209   static const int NumDims = internal::array_size<InputDims>::value;
    210   array<Index, NumDims> m_inputStrides;
    211   array<Index, NumDims> m_outputStrides;
    212   array<Index, NumDims> m_gpuInputStrides;
    213   array<Index, NumDims> m_gpuOutputStrides;
    214 };
    215 
    216 
    217 
    218 template<typename Dimensions, typename InputXprType, typename KernelXprType>
    219 struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
    220 {
    221   // Type promotion to handle the case where the types of the lhs and the rhs are different.
    222   typedef typename promote_storage_type<typename InputXprType::Scalar,
    223                                         typename KernelXprType::Scalar>::ret Scalar;
    224   typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
    225                                         typename traits<KernelXprType>::StorageKind>::ret StorageKind;
    226   typedef typename promote_index_type<typename traits<InputXprType>::Index,
    227                                       typename traits<KernelXprType>::Index>::type Index;
    228   typedef typename InputXprType::Nested LhsNested;
    229   typedef typename KernelXprType::Nested RhsNested;
    230   typedef typename remove_reference<LhsNested>::type _LhsNested;
    231   typedef typename remove_reference<RhsNested>::type _RhsNested;
    232   static const int NumDimensions = traits<InputXprType>::NumDimensions;
    233   static const int Layout = traits<InputXprType>::Layout;
    234   typedef typename conditional<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
    235   typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType>::type PointerType;
    236 
    237   enum {
    238     Flags = 0
    239   };
    240 };
    241 
    242 template<typename Dimensions, typename InputXprType, typename KernelXprType>
    243 struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
    244 {
    245   typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
    246 };
    247 
    248 template<typename Dimensions, typename InputXprType, typename KernelXprType>
    249 struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
    250 {
    251   typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
    252 };
    253 
    254 }  // end namespace internal
    255 
    256 
    257 
    258 template<typename Indices, typename InputXprType, typename KernelXprType>
    259 class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
    260 {
    261   public:
    262   typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
    263   typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    264   typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
    265                                                   typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
    266   typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
    267   typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
    268   typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
    269 
    270   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
    271       : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
    272 
    273     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    274     const Indices& indices() const { return m_indices; }
    275 
    276     /** \returns the nested expressions */
    277     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    278     const typename internal::remove_all<typename InputXprType::Nested>::type&
    279     inputExpression() const { return m_input_xpr; }
    280 
    281     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    282     const typename internal::remove_all<typename KernelXprType::Nested>::type&
    283     kernelExpression() const { return m_kernel_xpr; }
    284 
    285   protected:
    286     typename InputXprType::Nested m_input_xpr;
    287     typename KernelXprType::Nested m_kernel_xpr;
    288     const Indices m_indices;
    289 };
    290 
    291 
    292 template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
    293 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
    294 {
    295   typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
    296 
    297   static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
    298   static const int NumKernelDims = internal::array_size<Indices>::value;
    299   typedef typename XprType::Index Index;
    300   typedef DSizes<Index, NumDims> Dimensions;
    301 
    302   typedef typename XprType::Scalar Scalar;
    303   typedef typename XprType::CoeffReturnType CoeffReturnType;
    304   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
    305   static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
    306   typedef StorageMemory<Scalar, Device> Storage;
    307   typedef typename Storage::Type EvaluatorPointerType;
    308 
    309   enum {
    310     IsAligned = int(TensorEvaluator<InputArgType, Device>::IsAligned) & int(TensorEvaluator<KernelArgType, Device>::IsAligned),
    311     PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) & int(TensorEvaluator<KernelArgType, Device>::PacketAccess),
    312     BlockAccess = false,
    313     PreferBlockAccess = false,
    314     Layout = TensorEvaluator<InputArgType, Device>::Layout,
    315     CoordAccess = false,  // to be implemented
    316     RawAccess = false
    317   };
    318 
    319   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
    320   typedef internal::TensorBlockNotImplemented TensorBlock;
    321   //===--------------------------------------------------------------------===//
    322 
    323   EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
    324       : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
    325   {
    326     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
    327 
    328     const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
    329     const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
    330 
    331     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    332       m_inputStride[0] = 1;
    333       for (int i = 1; i < NumDims; ++i) {
    334         m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
    335       }
    336     } else {
    337       m_inputStride[NumDims - 1] = 1;
    338       for (int i = NumDims - 2; i >= 0; --i) {
    339         m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
    340       }
    341     }
    342 
    343     m_dimensions = m_inputImpl.dimensions();
    344     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    345       for (int i = 0; i < NumKernelDims; ++i) {
    346         const Index index = op.indices()[i];
    347         const Index input_dim = input_dims[index];
    348         const Index kernel_dim = kernel_dims[i];
    349         const Index result_dim = input_dim - kernel_dim + 1;
    350         m_dimensions[index] = result_dim;
    351         if (i > 0) {
    352           m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
    353         } else {
    354           m_kernelStride[0] = 1;
    355         }
    356         m_indexStride[i] = m_inputStride[index];
    357       }
    358 
    359       m_outputStride[0] = 1;
    360       for (int i = 1; i < NumDims; ++i) {
    361         m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
    362       }
    363     } else {
    364       for (int i = NumKernelDims - 1; i >= 0; --i) {
    365         const Index index = op.indices()[i];
    366         const Index input_dim = input_dims[index];
    367         const Index kernel_dim = kernel_dims[i];
    368         const Index result_dim = input_dim - kernel_dim + 1;
    369         m_dimensions[index] = result_dim;
    370         if (i < NumKernelDims - 1) {
    371           m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
    372         } else {
    373           m_kernelStride[NumKernelDims - 1] = 1;
    374         }
    375         m_indexStride[i] = m_inputStride[index];
    376       }
    377 
    378       m_outputStride[NumDims - 1] = 1;
    379       for (int i = NumDims - 2; i >= 0; --i) {
    380         m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
    381       }
    382     }
    383   }
    384 
    385   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
    386 
    387   EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
    388     m_inputImpl.evalSubExprsIfNeeded(NULL);
    389     preloadKernel();
    390     return true;
    391   }
    392   EIGEN_STRONG_INLINE void cleanup() {
    393     m_inputImpl.cleanup();
    394     if (m_local_kernel) {
    395       m_device.deallocate((void*)m_kernel);
    396       m_local_kernel = false;
    397     }
    398     m_kernel = NULL;
    399   }
    400 
    401   void evalTo(typename XprType::Scalar* buffer) {
    402     evalSubExprsIfNeeded(NULL);
    403     for (int i = 0; i < dimensions().TotalSize(); ++i) {
    404       buffer[i] += coeff(i);
    405     }
    406     cleanup();
    407   }
    408 
    409   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
    410   {
    411     CoeffReturnType result = CoeffReturnType(0);
    412     convolve(firstInput(index), 0, NumKernelDims-1, result);
    413     return result;
    414   }
    415 
    416   template<int LoadMode>
    417   EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
    418   {
    419     Index indices[2] = {index, index+PacketSize-1};
    420     Index startInputs[2] = {0, 0};
    421     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    422       for (int i = NumDims - 1; i > 0; --i) {
    423         const Index idx0 = indices[0] / m_outputStride[i];
    424         const Index idx1 = indices[1] / m_outputStride[i];
    425         startInputs[0] += idx0 * m_inputStride[i];
    426         startInputs[1] += idx1 * m_inputStride[i];
    427         indices[0] -= idx0 * m_outputStride[i];
    428         indices[1] -= idx1 * m_outputStride[i];
    429       }
    430     } else {
    431       for (int i = 0; i < NumDims - 1; ++i) {
    432         const Index idx0 = indices[0] / m_outputStride[i];
    433         const Index idx1 = indices[1] / m_outputStride[i];
    434         startInputs[0] += idx0 * m_inputStride[i];
    435         startInputs[1] += idx1 * m_inputStride[i];
    436         indices[0] -= idx0 * m_outputStride[i];
    437         indices[1] -= idx1 * m_outputStride[i];
    438       }
    439     }
    440     startInputs[0] += indices[0];
    441     startInputs[1] += indices[1];
    442 
    443     if (startInputs[1]-startInputs[0] == PacketSize-1) {
    444       PacketReturnType result = internal::pset1<PacketReturnType>(0);
    445       convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
    446       return result;
    447     } else {
    448       EIGEN_ALIGN_MAX Scalar data[PacketSize];
    449       data[0] = Scalar(0);
    450       convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
    451       for (int i = 1; i < PacketSize-1; ++i) {
    452         data[i] = Scalar(0);
    453         convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
    454       }
    455       data[PacketSize-1] = Scalar(0);
    456       convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
    457       return internal::pload<PacketReturnType>(data);
    458     }
    459   }
    460 
    461   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
    462   costPerCoeff(bool vectorized) const {
    463     const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    464     // We ignore the use of fused multiply-add.
    465     const double convolve_compute_cost =
    466         TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    467     const double firstIndex_compute_cost =
    468         NumDims *
    469         (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
    470          TensorOpCost::DivCost<Index>());
    471     return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
    472            kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
    473                           m_kernelImpl.costPerCoeff(vectorized) +
    474                           TensorOpCost(0, 0, convolve_compute_cost, vectorized,
    475                                        PacketSize));
    476   }
    477 
    478   EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
    479 
    480  private:
    481   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    482     Index startInput = 0;
    483     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    484       for (int i = NumDims - 1; i > 0; --i) {
    485         const Index idx = index / m_outputStride[i];
    486         startInput += idx * m_inputStride[i];
    487         index -= idx * m_outputStride[i];
    488       }
    489     } else {
    490       for (int i = 0; i < NumDims - 1; ++i) {
    491         const Index idx = index / m_outputStride[i];
    492         startInput += idx * m_inputStride[i];
    493         index -= idx * m_outputStride[i];
    494       }
    495     }
    496     startInput += index;
    497     return startInput;
    498   }
    499 
    500   EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
    501     for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
    502       const Index input = firstIndex + j * m_indexStride[DimIndex];
    503       const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
    504       if (DimIndex > 0) {
    505         convolve(input, kernel, DimIndex-1, accum);
    506       } else {
    507         accum += m_inputImpl.coeff(input) * m_kernel[kernel];
    508       }
    509     }
    510   }
    511 
    512   template <typename Packet>
    513   EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
    514     for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
    515       const Index input = firstIndex + j * m_indexStride[DimIndex];
    516       const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
    517       if (DimIndex > 0) {
    518         convolvePacket(input, kernel, DimIndex-1, accum);
    519       } else {
    520         accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
    521       }
    522     }
    523   }
    524 
    525   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
    526     // Don't make a local copy of the kernel unless we have to (i.e. it's an
    527     // expression that needs to be evaluated)
    528     const Scalar* in_place = m_kernelImpl.data();
    529     if (in_place) {
    530       m_kernel = in_place;
    531       m_local_kernel = false;
    532     } else {
    533       size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
    534       Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
    535       typedef TensorEvalToOp<const KernelArgType> EvalTo;
    536       EvalTo evalToTmp(local, m_kernelArg);
    537       const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
    538       internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);
    539 
    540       m_kernel = local;
    541       m_local_kernel = true;
    542     }
    543   }
    544 
    545   array<Index, NumDims> m_inputStride;
    546   array<Index, NumDims> m_outputStride;
    547 
    548   array<Index, NumKernelDims> m_indexStride;
    549   array<Index, NumKernelDims> m_kernelStride;
    550   TensorEvaluator<InputArgType, Device> m_inputImpl;
    551   TensorEvaluator<KernelArgType, Device> m_kernelImpl;
    552   Dimensions m_dimensions;
    553 
    554   KernelArgType m_kernelArg;
    555   const Scalar* m_kernel;
    556   bool m_local_kernel;
    557   const Device EIGEN_DEVICE_REF m_device;
    558 };
    559 
    560 
    561 
    562 
    563 // Use an optimized implementation of the evaluation code for GPUs whenever possible.
    564 #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
    565 
    566 template <int StaticKernelSize>
    567 struct GetKernelSize {
    568   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
    569     return StaticKernelSize;
    570   }
    571 };
    572 template <>
    573 struct GetKernelSize<Dynamic> {
    574   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
    575     return kernelSize;
    576   }
    577 };
    578 
    579 template <typename InputEvaluator, typename Index, typename InputDims,
    580           int StaticKernelSize>
    581 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
    582     InputEvaluator eval,
    583     const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
    584         indexMapper,
    585     const float* __restrict kernel, const int numPlanes, const int numX,
    586     const int maxX, const int kernelSize, float* buffer) {
    587 #if defined(EIGEN_HIPCC)
    588   HIP_DYNAMIC_SHARED(float, s)
    589 #else
    590   extern __shared__ float s[];
    591 #endif
    592 
    593   const int first_x = blockIdx.x * maxX;
    594   const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
    595   const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
    596   const int num_x_output = last_x - first_x + 1;
    597 
    598   const int first_plane = blockIdx.y * blockDim.y;
    599   const int plane_stride = blockDim.y * gridDim.y;
    600 
    601   for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
    602     // Load inputs to shared memory
    603     const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    604     const int plane_kernel_offset = threadIdx.y * num_x_input;
    605     #pragma unroll
    606     for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
    607       const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x);
    608       s[i + plane_kernel_offset] = eval.coeff(tensor_index);
    609     }
    610 
    611     __syncthreads();
    612 
    613     // Compute the convolution
    614     const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
    615 
    616     #pragma unroll
    617     for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
    618       const int kernel_offset = plane_kernel_offset + i;
    619       float result = 0.0f;
    620       #pragma unroll
    621       for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
    622         result += s[k + kernel_offset] * kernel[k];
    623       }
    624       const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x);
    625       buffer[tensor_index] = result;
    626     }
    627     __syncthreads();
    628   }
    629 };
    630 
    631 template <typename InputEvaluator, typename Index, typename InputDims,
    632           int StaticKernelSizeX, int StaticKernelSizeY>
    633 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
    634     InputEvaluator eval,
    635     const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
    636         indexMapper,
    637     const float* __restrict kernel, const int numPlanes, const int numX,
    638     const int maxX, const int numY, const int maxY, const int kernelSizeX,
    639     const int kernelSizeY, float* buffer) {
    640 #if defined(EIGEN_HIPCC)
    641   HIP_DYNAMIC_SHARED(float, s)
    642 #else
    643   extern __shared__ float s[];
    644 #endif
    645 
    646   const int first_x = blockIdx.x * maxX;
    647   const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
    648   const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
    649   const int num_x_output = last_x - first_x + 1;
    650 
    651   const int first_y = blockIdx.y * maxY;
    652   const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
    653   const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
    654   const int num_y_output = last_y - first_y + 1;
    655 
    656   const int first_plane = blockIdx.z * blockDim.z;
    657   const int plane_stride = blockDim.z * gridDim.z;
    658 
    659   for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
    660 
    661     const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    662     const int plane_kernel_offset = threadIdx.z * num_y_input;
    663 
    664     // Load inputs to shared memory
    665     #pragma unroll
    666     for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
    667       const int input_offset = num_x_input * (j + plane_kernel_offset);
    668       #pragma unroll
    669       for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
    670         const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y);
    671         s[i + input_offset] = eval.coeff(tensor_index);
    672       }
    673     }
    674 
    675     __syncthreads();
    676 
    677     // Convolution
    678     const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
    679 
    680     #pragma unroll
    681     for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
    682       #pragma unroll
    683       for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
    684         float result = 0.0f;
    685         #pragma unroll
    686         for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
    687           const int kernel_offset = kernelSizeX * l;
    688           const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
    689           #pragma unroll
    690           for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
    691             result += s[k + input_offset] * kernel[k + kernel_offset];
    692           }
    693         }
    694         const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
    695         buffer[tensor_index] = result;
    696       }
    697     }
    698 
    699     __syncthreads();
    700   }
    701 };
    702 
    703 template <typename InputEvaluator, typename Index, typename InputDims>
    704 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
    705     InputEvaluator eval,
    706     const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
    707         indexMapper,
    708     const float* __restrict kernel, const size_t numPlanes, const size_t numX,
    709     const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
    710     const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
    711     const size_t kernelSizeZ, float* buffer) {
    712 #if defined(EIGEN_HIPCC)
    713   HIP_DYNAMIC_SHARED(float, s)
    714 #else
    715   extern __shared__ float s[];
    716 #endif
    717 
    718   // Load inputs to shared memory
    719   const int first_x = blockIdx.x * maxX;
    720   const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
    721   const int num_x_input = last_x - first_x + kernelSizeX;
    722 
    723   const int first_y = blockIdx.y * maxY;
    724   const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
    725   const int num_y_input = last_y - first_y + kernelSizeY;
    726 
    727   const int first_z = blockIdx.z * maxZ;
    728   const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
    729   const int num_z_input = last_z - first_z + kernelSizeZ;
    730 
    731   for (int p = 0; p < numPlanes; ++p) {
    732 
    733     const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    734     const int plane_kernel_offset = 0;
    735 
    736     for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
    737       for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
    738         for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
    739           const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
    740           s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
    741         }
    742       }
    743     }
    744 
    745     __syncthreads();
    746 
    747     // Convolution
    748     const int num_z_output = last_z - first_z + 1;
    749     const int num_y_output = last_y - first_y + 1;
    750     const int num_x_output = last_x - first_x + 1;
    751     const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
    752 
    753     for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
    754       for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
    755         for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
    756           float result = 0.0f;
    757           for (int n = 0; n < kernelSizeZ; ++n) {
    758             for (int m = 0; m < kernelSizeY; ++m) {
    759               for (int l = 0; l < kernelSizeX; ++l) {
    760                 result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
    761               }
    762             }
    763           }
    764           const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
    765           buffer[tensor_index] = result;
    766         }
    767       }
    768     }
    769     __syncthreads();
    770   }
    771 };
    772 
    773 
    774 
    775 template<typename Indices, typename InputArgType, typename KernelArgType>
    776 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
    777 {
    778   typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
    779 
    780   static const int NumDims =  internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
    781   static const int NumKernelDims = internal::array_size<Indices>::value;
    782   typedef typename XprType::Index Index;
    783   typedef DSizes<Index, NumDims> Dimensions;
    784   typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
    785 
    786   enum {
    787     IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
    788     PacketAccess = false,
    789     BlockAccess = false,
    790     PreferBlockAccess = false,
    791     Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
    792     CoordAccess = false,  // to be implemented
    793     RawAccess = false
    794   };
    795 
    796   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
    797   typedef internal::TensorBlockNotImplemented TensorBlock;
    798   //===--------------------------------------------------------------------===//
    799 
    800   TensorEvaluator(const XprType& op, const GpuDevice& device)
    801       : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
    802   {
    803     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
    804 
    805     const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
    806     const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
    807 
    808     m_dimensions = m_inputImpl.dimensions();
    809     for (int i = 0; i < NumKernelDims; ++i) {
    810       const Index index = op.indices()[i];
    811       const Index input_dim = input_dims[index];
    812       const Index kernel_dim = kernel_dims[i];
    813       const Index result_dim = input_dim - kernel_dim + 1;
    814       m_dimensions[index] = result_dim;
    815     }
    816   }
    817 
    818   typedef typename XprType::CoeffReturnType CoeffReturnType;
    819   typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
    820   typedef typename InputArgType::Scalar Scalar;
    821   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
    822 
    823   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
    824 
    825   EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
    826     preloadKernel();
    827     m_inputImpl.evalSubExprsIfNeeded(NULL);
    828     if (data) {
    829       executeEval(data);
    830       return false;
    831     } else {
    832       m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
    833       executeEval(m_buf);
    834       return true;
    835     }
    836   }
    837 
    838   EIGEN_STRONG_INLINE void cleanup() {
    839     m_inputImpl.cleanup();
    840     if (m_buf) {
    841       m_device.deallocate(m_buf);
    842       m_buf = NULL;
    843     }
    844     if (m_local_kernel) {
    845       m_device.deallocate((void*)m_kernel);
    846       m_local_kernel = false;
    847     }
    848     m_kernel = NULL;
    849   }
    850 
    851   EIGEN_STRONG_INLINE void preloadKernel() {
    852     // Don't make a local copy of the kernel unless we have to (i.e. it's an
    853     // expression that needs to be evaluated)
    854     const Scalar* in_place = m_kernelImpl.data();
    855     if (in_place) {
    856       m_kernel = in_place;
    857       m_local_kernel = false;
    858     } else {
    859       size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
    860       Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
    861       typedef TensorEvalToOp<const KernelArgType> EvalTo;
    862       EvalTo evalToTmp(local, m_kernelArg);
    863       const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
    864       internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
    865 
    866       m_kernel = local;
    867       m_local_kernel = true;
    868     }
    869   }
    870 
    871   static unsigned int ceil(unsigned int num, unsigned int denom) {
    872     const unsigned int rounded_toward_zero = num / denom;
    873     if (num > rounded_toward_zero * denom) {
    874       return rounded_toward_zero + 1;
    875     }
    876     return rounded_toward_zero;
    877   }
    878 
    879   void executeEval(Scalar* data) const {
    880     typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
    881 
    882     const int maxSharedMem = m_device.sharedMemPerBlock();
    883     const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
    884     const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
    885     const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
    886     const int warpSize = 32;
    887 
    888     switch (NumKernelDims) {
    889       case 1: {
    890         const int kernel_size = m_kernelImpl.dimensions().TotalSize();
    891 
    892         const int numX = dimensions()[m_indices[0]];
    893         const int numP = dimensions().TotalSize() / numX;
    894         int maxX;
    895         dim3 block_size;
    896 
    897         const int single_stride_dim =
    898             static_cast<int>(Layout) == static_cast<int>(ColMajor)
    899                 ? 0
    900                 : m_inputImpl.dimensions().rank() - 1;
    901         if (m_indices[0] == single_stride_dim) {
    902           // Maximum the reuse
    903           const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
    904           maxX = numext::mini<int>(inner_dim, numX);
    905           const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
    906           block_size.x = numext::mini(maxThreadsPerBlock, maxX);
    907           block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
    908         }
    909         else {
    910           // Read as much as possible alongside the inner most dimension, that is the plane
    911           const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
    912           const int maxP = numext::mini<int>(inner_dim, numP);
    913           maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
    914 
    915           block_size.x = numext::mini(warpSize, maxX);
    916           block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
    917         }
    918 
    919         const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
    920         gpu_assert(shared_mem <= maxSharedMem);
    921 
    922         const int num_x_blocks = ceil(numX, maxX);
    923         const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
    924         const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
    925 
    926         dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
    927 
    928 
    929         //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
    930 
    931         const array<Index, 1> indices(m_indices[0]);
    932         const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
    933         internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
    934             m_inputImpl.dimensions(), kernel_dims, indices);
    935         switch(kernel_size) {
    936           case 4: {
    937             LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
    938             break;
    939           }
    940           case 7: {
    941             LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
    942             break;
    943           }
    944           default: {
    945             LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
    946           }
    947         }
    948         break;
    949       }
    950 
    951       case 2: {
    952         const int idxX =
    953             static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
    954         const int idxY =
    955             static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
    956         const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
    957         const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
    958 
    959         const int numX = dimensions()[m_indices[idxX]];
    960         const int numY = dimensions()[m_indices[idxY]];
    961         const int numP = dimensions().TotalSize() / (numX*numY);
    962 
    963         const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
    964 
    965         // Snap maxX to warp size
    966         int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
    967         const int maxX = numext::mini<int>(inner_dim, numX);
    968         const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
    969         const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
    970 
    971         dim3 block_size;
    972         block_size.x = numext::mini(1024, maxX);
    973         block_size.y = numext::mini<int>(1024/block_size.x, maxY);
    974         block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
    975 
    976         const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
    977         gpu_assert(shared_mem <= maxSharedMem);
    978 
    979         const int num_x_blocks = ceil(numX, maxX);
    980         const int num_y_blocks = ceil(numY, maxY);
    981         const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
    982         const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
    983 
    984         dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
    985 
    986 
    987         //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y  << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
    988 
    989         const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
    990         const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
    991                                           m_kernelImpl.dimensions()[idxY]);
    992         internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
    993             m_inputImpl.dimensions(), kernel_dims, indices);
    994         switch (kernel_size_x) {
    995           case 4: {
    996             switch (kernel_size_y) {
    997               case 7: {
    998                 LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
    999                 break;
   1000               }
   1001               default: {
   1002                 LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
   1003                 break;
   1004               }
   1005             }
   1006             break;
   1007           }
   1008           case 7: {
   1009             switch (kernel_size_y) {
   1010               case 4: {
   1011                 LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
   1012                 break;
   1013               }
   1014               default: {
   1015                 LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
   1016                 break;
   1017               }
   1018             }
   1019             break;
   1020           }
   1021           default: {
   1022             LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
   1023             break;
   1024           }
   1025         }
   1026         break;
   1027       }
   1028 
   1029       case 3: {
   1030         const int idxX =
   1031             static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
   1032         const int idxY =
   1033             static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
   1034         const int idxZ =
   1035             static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
   1036 
   1037         const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
   1038         const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
   1039         const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
   1040 
   1041         const int numX = dimensions()[m_indices[idxX]];
   1042         const int numY = dimensions()[m_indices[idxY]];
   1043         const int numZ = dimensions()[m_indices[idxZ]];
   1044         const int numP = dimensions().TotalSize() / (numX*numY*numZ);
   1045 
   1046         const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
   1047         const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
   1048         const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
   1049 
   1050         dim3 block_size;
   1051         block_size.x = numext::mini(32, maxX);
   1052         block_size.y = numext::mini(32, maxY);
   1053         block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
   1054         dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
   1055 
   1056         const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
   1057         gpu_assert(shared_mem <= maxSharedMem);
   1058 
   1059         //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y  << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z  << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
   1060         const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
   1061                                       m_indices[idxZ]);
   1062         const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
   1063                                           m_kernelImpl.dimensions()[idxY],
   1064                                           m_kernelImpl.dimensions()[idxZ]);
   1065         internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
   1066             m_inputImpl.dimensions(), kernel_dims, indices);
   1067 
   1068         LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
   1069         break;
   1070       }
   1071 
   1072       default: {
   1073         EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
   1074       }
   1075     }
   1076   }
   1077 
   1078   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   1079   {
   1080     eigen_assert(m_buf);
   1081     eigen_assert(index < m_dimensions.TotalSize());
   1082     return m_buf[index];
   1083   }
   1084 
   1085   template<int LoadMode>
   1086   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
   1087   {
   1088     eigen_assert(m_buf);
   1089     eigen_assert(index < m_dimensions.TotalSize());
   1090     return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
   1091   }
   1092 
   1093   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   1094   costPerCoeff(bool vectorized) const {
   1095     // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
   1096     // model.
   1097     const double kernel_size = m_kernelImpl.dimensions().TotalSize();
   1098     // We ignore the use of fused multiply-add.
   1099     const double convolve_compute_cost =
   1100         TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
   1101     const double firstIndex_compute_cost =
   1102         NumDims *
   1103         (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
   1104          TensorOpCost::DivCost<Index>());
   1105     return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
   1106            kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
   1107                           m_kernelImpl.costPerCoeff(vectorized) +
   1108                           TensorOpCost(0, 0, convolve_compute_cost, vectorized,
   1109                                        PacketSize));
   1110   }
   1111 
   1112  private:
   1113   // No assignment (copies are needed by the kernels)
   1114   TensorEvaluator& operator = (const TensorEvaluator&);
   1115 
   1116   TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
   1117   TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
   1118   KernelArgType m_kernelArg;
   1119   Indices m_indices;
   1120   Dimensions m_dimensions;
   1121   Scalar* m_buf;
   1122   const Scalar* m_kernel;
   1123   bool m_local_kernel;
   1124 
   1125   const GpuDevice& m_device;
   1126 };
   1127 #endif
   1128 
   1129 
   1130 } // end namespace Eigen
   1131 
   1132 #endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H