cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

TensorPadding.h (28764B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
     11 #define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
     12 
     13 namespace Eigen {
     14 
     15 /** \class TensorPadding
     16   * \ingroup CXX11_Tensor_Module
     17   *
     18   * \brief Tensor padding class.
     19   * At the moment only padding with a constant value is supported.
     20   *
     21   */
     22 namespace internal {
     23 template<typename PaddingDimensions, typename XprType>
     24 struct traits<TensorPaddingOp<PaddingDimensions, XprType> > : public traits<XprType>
     25 {
     26   typedef typename XprType::Scalar Scalar;
     27   typedef traits<XprType> XprTraits;
     28   typedef typename XprTraits::StorageKind StorageKind;
     29   typedef typename XprTraits::Index Index;
     30   typedef typename XprType::Nested Nested;
     31   typedef typename remove_reference<Nested>::type _Nested;
     32   static const int NumDimensions = XprTraits::NumDimensions;
     33   static const int Layout = XprTraits::Layout;
     34   typedef typename XprTraits::PointerType PointerType;
     35 };
     36 
     37 template<typename PaddingDimensions, typename XprType>
     38 struct eval<TensorPaddingOp<PaddingDimensions, XprType>, Eigen::Dense>
     39 {
     40   typedef const TensorPaddingOp<PaddingDimensions, XprType>& type;
     41 };
     42 
     43 template<typename PaddingDimensions, typename XprType>
     44 struct nested<TensorPaddingOp<PaddingDimensions, XprType>, 1, typename eval<TensorPaddingOp<PaddingDimensions, XprType> >::type>
     45 {
     46   typedef TensorPaddingOp<PaddingDimensions, XprType> type;
     47 };
     48 
     49 }  // end namespace internal
     50 
     51 
     52 
     53 template<typename PaddingDimensions, typename XprType>
     54 class TensorPaddingOp : public TensorBase<TensorPaddingOp<PaddingDimensions, XprType>, ReadOnlyAccessors>
     55 {
     56   public:
     57   typedef typename Eigen::internal::traits<TensorPaddingOp>::Scalar Scalar;
     58   typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
     59   typedef typename XprType::CoeffReturnType CoeffReturnType;
     60   typedef typename Eigen::internal::nested<TensorPaddingOp>::type Nested;
     61   typedef typename Eigen::internal::traits<TensorPaddingOp>::StorageKind StorageKind;
     62   typedef typename Eigen::internal::traits<TensorPaddingOp>::Index Index;
     63 
     64   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims, const Scalar padding_value)
     65       : m_xpr(expr), m_padding_dims(padding_dims), m_padding_value(padding_value) {}
     66 
     67     EIGEN_DEVICE_FUNC
     68     const PaddingDimensions& padding() const { return m_padding_dims; }
     69     EIGEN_DEVICE_FUNC
     70     Scalar padding_value() const { return m_padding_value; }
     71 
     72     EIGEN_DEVICE_FUNC
     73     const typename internal::remove_all<typename XprType::Nested>::type&
     74     expression() const { return m_xpr; }
     75 
     76   protected:
     77     typename XprType::Nested m_xpr;
     78     const PaddingDimensions m_padding_dims;
     79     const Scalar m_padding_value;
     80 };
     81 
     82 
     83 // Eval as rvalue
     84 template<typename PaddingDimensions, typename ArgType, typename Device>
     85 struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device>
     86 {
     87   typedef TensorPaddingOp<PaddingDimensions, ArgType> XprType;
     88   typedef typename XprType::Index Index;
     89   static const int NumDims = internal::array_size<PaddingDimensions>::value;
     90   typedef DSizes<Index, NumDims> Dimensions;
     91   typedef typename XprType::Scalar Scalar;
     92   typedef typename XprType::CoeffReturnType CoeffReturnType;
     93   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
     94   static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
     95   typedef StorageMemory<CoeffReturnType, Device> Storage;
     96   typedef typename Storage::Type EvaluatorPointerType;
     97 
     98   enum {
     99     IsAligned         = true,
    100     PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
    101     BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
    102     PreferBlockAccess = true,
    103     Layout            = TensorEvaluator<ArgType, Device>::Layout,
    104     CoordAccess       = true,
    105     RawAccess         = false
    106   };
    107 
    108   typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
    109 
    110   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
    111   typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
    112   typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
    113 
    114   typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
    115                                                      Layout, Index>
    116       TensorBlock;
    117   //===--------------------------------------------------------------------===//
    118 
    119   EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
    120       : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device)
    121   {
    122     // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead
    123     // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector
    124     // of 1 element first and then pad.
    125     EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    126 
    127     // Compute dimensions
    128     m_dimensions = m_impl.dimensions();
    129     for (int i = 0; i < NumDims; ++i) {
    130       m_dimensions[i] += m_padding[i].first + m_padding[i].second;
    131     }
    132     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
    133     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    134       m_inputStrides[0] = 1;
    135       m_outputStrides[0] = 1;
    136       for (int i = 1; i < NumDims; ++i) {
    137         m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
    138         m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
    139       }
    140       m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1];
    141     } else {
    142       m_inputStrides[NumDims - 1] = 1;
    143       m_outputStrides[NumDims] = 1;
    144       for (int i = NumDims - 2; i >= 0; --i) {
    145         m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
    146         m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1];
    147       }
    148       m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0];
    149     }
    150   }
    151 
    152   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
    153 
    154   EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
    155     m_impl.evalSubExprsIfNeeded(NULL);
    156     return true;
    157   }
    158 
    159 #ifdef EIGEN_USE_THREADS
    160   template <typename EvalSubExprsCallback>
    161   EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
    162       EvaluatorPointerType, EvalSubExprsCallback done) {
    163     m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
    164   }
    165 #endif  // EIGEN_USE_THREADS
    166 
    167   EIGEN_STRONG_INLINE void cleanup() {
    168     m_impl.cleanup();
    169   }
    170 
    171   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
    172   {
    173     eigen_assert(index < dimensions().TotalSize());
    174     Index inputIndex = 0;
    175     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    176       EIGEN_UNROLL_LOOP
    177       for (int i = NumDims - 1; i > 0; --i) {
    178         const Index idx = index / m_outputStrides[i];
    179         if (isPaddingAtIndexForDim(idx, i)) {
    180           return m_paddingValue;
    181         }
    182         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
    183         index -= idx * m_outputStrides[i];
    184       }
    185       if (isPaddingAtIndexForDim(index, 0)) {
    186         return m_paddingValue;
    187       }
    188       inputIndex += (index - m_padding[0].first);
    189     } else {
    190       EIGEN_UNROLL_LOOP
    191       for (int i = 0; i < NumDims - 1; ++i) {
    192         const Index idx = index / m_outputStrides[i+1];
    193         if (isPaddingAtIndexForDim(idx, i)) {
    194           return m_paddingValue;
    195         }
    196         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
    197         index -= idx * m_outputStrides[i+1];
    198       }
    199       if (isPaddingAtIndexForDim(index, NumDims-1)) {
    200         return m_paddingValue;
    201       }
    202       inputIndex += (index - m_padding[NumDims-1].first);
    203     }
    204     return m_impl.coeff(inputIndex);
    205   }
    206 
    207   template<int LoadMode>
    208   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
    209   {
    210     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    211       return packetColMajor(index);
    212     }
    213     return packetRowMajor(index);
    214   }
    215 
    216   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    217     TensorOpCost cost = m_impl.costPerCoeff(vectorized);
    218     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    219       EIGEN_UNROLL_LOOP
    220       for (int i = 0; i < NumDims; ++i)
    221         updateCostPerDimension(cost, i, i == 0);
    222     } else {
    223       EIGEN_UNROLL_LOOP
    224       for (int i = NumDims - 1; i >= 0; --i)
    225         updateCostPerDimension(cost, i, i == NumDims - 1);
    226     }
    227     return cost;
    228   }
    229 
    230   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    231   internal::TensorBlockResourceRequirements getResourceRequirements() const {
    232     const size_t target_size = m_device.lastLevelCacheSize();
    233     return internal::TensorBlockResourceRequirements::merge(
    234         internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
    235         m_impl.getResourceRequirements());
    236   }
    237 
    238   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
    239   block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
    240           bool /*root_of_expr_ast*/ = false) const {
    241     // If one of the dimensions is zero, return empty block view.
    242     if (desc.size() == 0) {
    243       return TensorBlock(internal::TensorBlockKind::kView, NULL,
    244                            desc.dimensions());
    245     }
    246 
    247     static const bool IsColMajor = Layout == static_cast<int>(ColMajor);
    248     const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1;
    249 
    250     Index offset = desc.offset();
    251 
    252     // Compute offsets in the output tensor corresponding to the desc.offset().
    253     DSizes<Index, NumDims> output_offsets;
    254     for (int i = NumDims - 1; i > 0; --i) {
    255       const int dim = IsColMajor ? i : NumDims - i - 1;
    256       const int stride_dim = IsColMajor ? dim : dim + 1;
    257       output_offsets[dim] = offset / m_outputStrides[stride_dim];
    258       offset -= output_offsets[dim] * m_outputStrides[stride_dim];
    259     }
    260     output_offsets[inner_dim_idx] = offset;
    261 
    262     // Offsets in the input corresponding to output offsets.
    263     DSizes<Index, NumDims> input_offsets = output_offsets;
    264     for (int i = 0; i < NumDims; ++i) {
    265       const int dim = IsColMajor ? i : NumDims - i - 1;
    266       input_offsets[dim] = input_offsets[dim] - m_padding[dim].first;
    267     }
    268 
    269     // Compute offset in the input buffer (at this point it might be illegal and
    270     // point outside of the input buffer, because we don't check for negative
    271     // offsets, it will be autocorrected in the block iteration loop below).
    272     Index input_offset = 0;
    273     for (int i = 0; i < NumDims; ++i) {
    274       const int dim = IsColMajor ? i : NumDims - i - 1;
    275       input_offset += input_offsets[dim] * m_inputStrides[dim];
    276     }
    277 
    278     // Destination buffer and scratch buffer both indexed from 0 and have the
    279     // same dimensions as the requested block (for destination buffer this
    280     // property is guaranteed by `desc.destination()`).
    281     Index output_offset = 0;
    282     const DSizes<Index, NumDims> output_strides =
    283         internal::strides<Layout>(desc.dimensions());
    284 
    285     // NOTE(ezhulenev): We initialize bock iteration state for `NumDims - 1`
    286     // dimensions, skipping innermost dimension. In theory it should be possible
    287     // to squeeze matching innermost dimensions, however in practice that did
    288     // not show any improvements in benchmarks. Also in practice first outer
    289     // dimension usually has padding, and will prevent squeezing.
    290 
    291     // Initialize output block iterator state. Dimension in this array are
    292     // always in inner_most -> outer_most order (col major layout).
    293     array<BlockIteratorState, NumDims - 1> it;
    294     for (int i = 0; i < NumDims - 1; ++i) {
    295       const int dim = IsColMajor ? i + 1 : NumDims - i - 2;
    296       it[i].count = 0;
    297       it[i].size = desc.dimension(dim);
    298 
    299       it[i].input_stride = m_inputStrides[dim];
    300       it[i].input_span = it[i].input_stride * (it[i].size - 1);
    301 
    302       it[i].output_stride = output_strides[dim];
    303       it[i].output_span = it[i].output_stride * (it[i].size - 1);
    304     }
    305 
    306     const Index input_inner_dim_size =
    307         static_cast<Index>(m_impl.dimensions()[inner_dim_idx]);
    308 
    309     // Total output size.
    310     const Index output_size = desc.size();
    311 
    312     // We will fill inner dimension of this size in the output. It might be
    313     // larger than the inner dimension in the input, so we might have to pad
    314     // before/after we copy values from the input inner dimension.
    315     const Index output_inner_dim_size = desc.dimension(inner_dim_idx);
    316 
    317     // How many values to fill with padding BEFORE reading from the input inner
    318     // dimension.
    319     const Index output_inner_pad_before_size =
    320         input_offsets[inner_dim_idx] < 0
    321             ? numext::mini(numext::abs(input_offsets[inner_dim_idx]),
    322                            output_inner_dim_size)
    323             : 0;
    324 
    325     // How many values we can actually copy from the input inner dimension.
    326     const Index output_inner_copy_size = numext::mini(
    327         // Want to copy from input.
    328         (output_inner_dim_size - output_inner_pad_before_size),
    329         // Can copy from input.
    330         numext::maxi(input_inner_dim_size - (input_offsets[inner_dim_idx] +
    331                                              output_inner_pad_before_size),
    332                      Index(0)));
    333 
    334     eigen_assert(output_inner_copy_size >= 0);
    335 
    336     // How many values to fill with padding AFTER reading from the input inner
    337     // dimension.
    338     const Index output_inner_pad_after_size =
    339         (output_inner_dim_size - output_inner_copy_size -
    340          output_inner_pad_before_size);
    341 
    342     // Sanity check, sum of all sizes must be equal to the output size.
    343     eigen_assert(output_inner_dim_size ==
    344                  (output_inner_pad_before_size + output_inner_copy_size +
    345                   output_inner_pad_after_size));
    346 
    347     // Keep track of current coordinates and padding in the output.
    348     DSizes<Index, NumDims> output_coord = output_offsets;
    349     DSizes<Index, NumDims> output_padded;
    350     for (int i = 0; i < NumDims; ++i) {
    351       const int dim = IsColMajor ? i : NumDims - i - 1;
    352       output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
    353     }
    354 
    355     typedef internal::StridedLinearBufferCopy<ScalarNoConst, Index> LinCopy;
    356 
    357     // Prepare storage for the materialized padding result.
    358     const typename TensorBlock::Storage block_storage =
    359         TensorBlock::prepareStorage(desc, scratch);
    360 
    361     // TODO(ezhulenev): Squeeze multiple non-padded inner dimensions into a
    362     // single logical inner dimension.
    363 
    364     // When possible we squeeze writes for the innermost (only if non-padded)
    365     // dimension with the first padded dimension. This allows to reduce the
    366     // number of calls to LinCopy and better utilize vector instructions.
    367     const bool squeeze_writes =
    368         NumDims > 1 &&
    369         // inner dimension is not padded
    370         (input_inner_dim_size == m_dimensions[inner_dim_idx]) &&
    371         // and equal to the block inner dimension
    372         (input_inner_dim_size == output_inner_dim_size);
    373 
    374     const int squeeze_dim = IsColMajor ? inner_dim_idx + 1 : inner_dim_idx - 1;
    375 
    376     // Maximum coordinate on a squeeze dimension that we can write to.
    377     const Index squeeze_max_coord =
    378         squeeze_writes ? numext::mini(
    379                              // max non-padded element in the input
    380                              static_cast<Index>(m_dimensions[squeeze_dim] -
    381                                                 m_padding[squeeze_dim].second),
    382                              // max element in the output buffer
    383                              static_cast<Index>(output_offsets[squeeze_dim] +
    384                                                 desc.dimension(squeeze_dim)))
    385                        : static_cast<Index>(0);
    386 
    387     // Iterate copying data from `m_impl.data()` to the output buffer.
    388     for (Index size = 0; size < output_size;) {
    389       // Detect if we are in the padded region (exclude innermost dimension).
    390       bool is_padded = false;
    391       for (int j = 1; j < NumDims; ++j) {
    392         const int dim = IsColMajor ? j : NumDims - j - 1;
    393         is_padded = output_padded[dim];
    394         if (is_padded) break;
    395       }
    396 
    397       if (is_padded) {
    398         // Fill single innermost dimension with padding value.
    399         size += output_inner_dim_size;
    400 
    401         LinCopy::template Run<LinCopy::Kind::FillLinear>(
    402             typename LinCopy::Dst(output_offset, 1, block_storage.data()),
    403             typename LinCopy::Src(0, 0, &m_paddingValue),
    404             output_inner_dim_size);
    405 
    406 
    407       } else if (squeeze_writes) {
    408         // Squeeze multiple reads from innermost dimensions.
    409         const Index squeeze_num = squeeze_max_coord - output_coord[squeeze_dim];
    410         size += output_inner_dim_size * squeeze_num;
    411 
    412         // Copy `squeeze_num` inner dimensions from input to output.
    413         LinCopy::template Run<LinCopy::Kind::Linear>(
    414             typename LinCopy::Dst(output_offset, 1, block_storage.data()),
    415             typename LinCopy::Src(input_offset, 1, m_impl.data()),
    416             output_inner_dim_size * squeeze_num);
    417 
    418         // Update iteration state for only `squeeze_num - 1` processed inner
    419         // dimensions, because we have another iteration state update at the end
    420         // of the loop that will update iteration state for the last inner
    421         // processed dimension.
    422         it[0].count += (squeeze_num - 1);
    423         input_offset += it[0].input_stride * (squeeze_num - 1);
    424         output_offset += it[0].output_stride * (squeeze_num - 1);
    425         output_coord[squeeze_dim] += (squeeze_num - 1);
    426 
    427       } else {
    428         // Single read from innermost dimension.
    429         size += output_inner_dim_size;
    430 
    431         {  // Fill with padding before copying from input inner dimension.
    432           const Index out = output_offset;
    433 
    434           LinCopy::template Run<LinCopy::Kind::FillLinear>(
    435               typename LinCopy::Dst(out, 1, block_storage.data()),
    436               typename LinCopy::Src(0, 0, &m_paddingValue),
    437               output_inner_pad_before_size);
    438         }
    439 
    440         {  // Copy data from input inner dimension.
    441           const Index out = output_offset + output_inner_pad_before_size;
    442           const Index in = input_offset + output_inner_pad_before_size;
    443 
    444           eigen_assert(output_inner_copy_size == 0 || m_impl.data() != NULL);
    445 
    446           LinCopy::template Run<LinCopy::Kind::Linear>(
    447               typename LinCopy::Dst(out, 1, block_storage.data()),
    448               typename LinCopy::Src(in, 1, m_impl.data()),
    449               output_inner_copy_size);
    450         }
    451 
    452         {  // Fill with padding after copying from input inner dimension.
    453           const Index out = output_offset + output_inner_pad_before_size +
    454                             output_inner_copy_size;
    455 
    456           LinCopy::template Run<LinCopy::Kind::FillLinear>(
    457               typename LinCopy::Dst(out, 1, block_storage.data()),
    458               typename LinCopy::Src(0, 0, &m_paddingValue),
    459               output_inner_pad_after_size);
    460         }
    461       }
    462 
    463       for (int j = 0; j < NumDims - 1; ++j) {
    464         const int dim = IsColMajor ? j + 1 : NumDims - j - 2;
    465 
    466         if (++it[j].count < it[j].size) {
    467           input_offset += it[j].input_stride;
    468           output_offset += it[j].output_stride;
    469           output_coord[dim] += 1;
    470           output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
    471           break;
    472         }
    473         it[j].count = 0;
    474         input_offset -= it[j].input_span;
    475         output_offset -= it[j].output_span;
    476         output_coord[dim] -= it[j].size - 1;
    477         output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
    478       }
    479     }
    480 
    481     return block_storage.AsTensorMaterializedBlock();
    482   }
    483 
    484   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
    485 
    486 #ifdef EIGEN_USE_SYCL
    487   // binding placeholder accessors to a command group handler for SYCL
    488   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    489     m_impl.bind(cgh);
    490   }
    491 #endif
    492 
    493  private:
    494   struct BlockIteratorState {
    495     BlockIteratorState()
    496         : count(0),
    497           size(0),
    498           input_stride(0),
    499           input_span(0),
    500           output_stride(0),
    501           output_span(0) {}
    502 
    503     Index count;
    504     Index size;
    505     Index input_stride;
    506     Index input_span;
    507     Index output_stride;
    508     Index output_span;
    509   };
    510 
    511   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
    512       Index index, int dim_index) const {
    513 #if defined(EIGEN_HAS_INDEX_LIST)
    514     return (!internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0) &&
    515             index < m_padding[dim_index].first) ||
    516         (!internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0) &&
    517          index >= m_dimensions[dim_index] - m_padding[dim_index].second);
    518 #else
    519     return (index < m_padding[dim_index].first) ||
    520            (index >= m_dimensions[dim_index] - m_padding[dim_index].second);
    521 #endif
    522   }
    523 
    524   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero(
    525       int dim_index) const {
    526 #if defined(EIGEN_HAS_INDEX_LIST)
    527     return internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0);
    528 #else
    529     EIGEN_UNUSED_VARIABLE(dim_index);
    530     return false;
    531 #endif
    532   }
    533 
    534   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero(
    535       int dim_index) const {
    536 #if defined(EIGEN_HAS_INDEX_LIST)
    537     return internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0);
    538 #else
    539     EIGEN_UNUSED_VARIABLE(dim_index);
    540     return false;
    541 #endif
    542   }
    543 
    544 
    545   void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const {
    546     const double in = static_cast<double>(m_impl.dimensions()[i]);
    547     const double out = in + m_padding[i].first + m_padding[i].second;
    548     if (out == 0)
    549       return;
    550     const double reduction = in / out;
    551     cost *= reduction;
    552     if (first) {
    553       cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
    554                     reduction * (1 * TensorOpCost::AddCost<Index>()));
    555     } else {
    556       cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
    557                                  2 * TensorOpCost::MulCost<Index>() +
    558                     reduction * (2 * TensorOpCost::MulCost<Index>() +
    559                                  1 * TensorOpCost::DivCost<Index>()));
    560     }
    561   }
    562 
    563  protected:
    564 
    565   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
    566   {
    567     EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    568     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
    569 
    570     const Index initialIndex = index;
    571     Index inputIndex = 0;
    572     EIGEN_UNROLL_LOOP
    573     for (int i = NumDims - 1; i > 0; --i) {
    574       const Index firstIdx = index;
    575       const Index lastIdx = index + PacketSize - 1;
    576       const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
    577       const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
    578       const Index lastPaddedRight = m_outputStrides[i+1];
    579 
    580       if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
    581         // all the coefficient are in the padding zone.
    582         return internal::pset1<PacketReturnType>(m_paddingValue);
    583       }
    584       else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
    585         // all the coefficient are in the padding zone.
    586         return internal::pset1<PacketReturnType>(m_paddingValue);
    587       }
    588       else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
    589         // all the coefficient are between the 2 padding zones.
    590         const Index idx = index / m_outputStrides[i];
    591         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
    592         index -= idx * m_outputStrides[i];
    593       }
    594       else {
    595         // Every other case
    596         return packetWithPossibleZero(initialIndex);
    597       }
    598     }
    599 
    600     const Index lastIdx = index + PacketSize - 1;
    601     const Index firstIdx = index;
    602     const Index lastPaddedLeft = m_padding[0].first;
    603     const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
    604     const Index lastPaddedRight = m_outputStrides[1];
    605 
    606     if (!isLeftPaddingCompileTimeZero(0) && lastIdx < lastPaddedLeft) {
    607       // all the coefficient are in the padding zone.
    608       return internal::pset1<PacketReturnType>(m_paddingValue);
    609     }
    610     else if (!isRightPaddingCompileTimeZero(0) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
    611       // all the coefficient are in the padding zone.
    612       return internal::pset1<PacketReturnType>(m_paddingValue);
    613     }
    614     else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
    615       // all the coefficient are between the 2 padding zones.
    616       inputIndex += (index - m_padding[0].first);
    617       return m_impl.template packet<Unaligned>(inputIndex);
    618     }
    619     // Every other case
    620     return packetWithPossibleZero(initialIndex);
    621   }
    622 
    623   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
    624   {
    625     EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    626     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
    627 
    628     const Index initialIndex = index;
    629     Index inputIndex = 0;
    630     EIGEN_UNROLL_LOOP
    631     for (int i = 0; i < NumDims - 1; ++i) {
    632       const Index firstIdx = index;
    633       const Index lastIdx = index + PacketSize - 1;
    634       const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1];
    635       const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1];
    636       const Index lastPaddedRight = m_outputStrides[i];
    637 
    638       if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
    639         // all the coefficient are in the padding zone.
    640         return internal::pset1<PacketReturnType>(m_paddingValue);
    641       }
    642       else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
    643         // all the coefficient are in the padding zone.
    644         return internal::pset1<PacketReturnType>(m_paddingValue);
    645       }
    646       else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
    647         // all the coefficient are between the 2 padding zones.
    648         const Index idx = index / m_outputStrides[i+1];
    649         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
    650         index -= idx * m_outputStrides[i+1];
    651       }
    652       else {
    653         // Every other case
    654         return packetWithPossibleZero(initialIndex);
    655       }
    656     }
    657 
    658     const Index lastIdx = index + PacketSize - 1;
    659     const Index firstIdx = index;
    660     const Index lastPaddedLeft = m_padding[NumDims-1].first;
    661     const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
    662     const Index lastPaddedRight = m_outputStrides[NumDims-1];
    663 
    664     if (!isLeftPaddingCompileTimeZero(NumDims-1) && lastIdx < lastPaddedLeft) {
    665       // all the coefficient are in the padding zone.
    666       return internal::pset1<PacketReturnType>(m_paddingValue);
    667     }
    668     else if (!isRightPaddingCompileTimeZero(NumDims-1) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
    669       // all the coefficient are in the padding zone.
    670       return internal::pset1<PacketReturnType>(m_paddingValue);
    671     }
    672     else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
    673       // all the coefficient are between the 2 padding zones.
    674       inputIndex += (index - m_padding[NumDims-1].first);
    675       return m_impl.template packet<Unaligned>(inputIndex);
    676     }
    677     // Every other case
    678     return packetWithPossibleZero(initialIndex);
    679   }
    680 
    681   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
    682   {
    683     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    684     EIGEN_UNROLL_LOOP
    685     for (int i = 0; i < PacketSize; ++i) {
    686       values[i] = coeff(index+i);
    687     }
    688     PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    689     return rslt;
    690   }
    691 
    692   Dimensions m_dimensions;
    693   array<Index, NumDims+1> m_outputStrides;
    694   array<Index, NumDims> m_inputStrides;
    695   TensorEvaluator<ArgType, Device> m_impl;
    696   PaddingDimensions m_padding;
    697 
    698   Scalar m_paddingValue;
    699 
    700   const Device EIGEN_DEVICE_REF m_device;
    701 };
    702 
    703 
    704 
    705 
    706 } // end namespace Eigen
    707 
    708 #endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H