cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

TensorShuffling.h (18256B)


      1 // This file is part of Eigen, a lightweight C++ template library
      2 // for linear algebra.
      3 //
      4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
      5 //
      6 // This Source Code Form is subject to the terms of the Mozilla
      7 // Public License v. 2.0. If a copy of the MPL was not distributed
      8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
      9 
     10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
     11 #define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
     12 
     13 namespace Eigen {
     14 
     15 /** \class TensorShuffling
     16   * \ingroup CXX11_Tensor_Module
     17   *
     18   * \brief Tensor shuffling class.
     19   *
     20   *
     21   */
     22 namespace internal {
     23 template<typename Shuffle, typename XprType>
     24 struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType>
     25 {
     26   typedef typename XprType::Scalar Scalar;
     27   typedef traits<XprType> XprTraits;
     28   typedef typename XprTraits::StorageKind StorageKind;
     29   typedef typename XprTraits::Index Index;
     30   typedef typename XprType::Nested Nested;
     31   typedef typename remove_reference<Nested>::type _Nested;
     32   static const int NumDimensions = XprTraits::NumDimensions;
     33   static const int Layout = XprTraits::Layout;
     34   typedef typename XprTraits::PointerType PointerType;
     35 };
     36 
     37 template<typename Shuffle, typename XprType>
     38 struct eval<TensorShufflingOp<Shuffle, XprType>, Eigen::Dense>
     39 {
     40   typedef const TensorShufflingOp<Shuffle, XprType>& type;
     41 };
     42 
     43 template<typename Shuffle, typename XprType>
     44 struct nested<TensorShufflingOp<Shuffle, XprType>, 1, typename eval<TensorShufflingOp<Shuffle, XprType> >::type>
     45 {
     46   typedef TensorShufflingOp<Shuffle, XprType> type;
     47 };
     48 
     49 }  // end namespace internal
     50 
     51 
     52 
     53 template<typename Shuffle, typename XprType>
     54 class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> >
     55 {
     56   public:
     57     typedef TensorBase<TensorShufflingOp<Shuffle, XprType> > Base;
     58     typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar;
     59     typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
     60     typedef typename XprType::CoeffReturnType CoeffReturnType;
     61     typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested;
     62     typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
     63     typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;
     64 
     65     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shfl)
     66       : m_xpr(expr), m_shuffle(shfl) {}
     67 
     68     EIGEN_DEVICE_FUNC
     69     const Shuffle& shufflePermutation() const { return m_shuffle; }
     70 
     71     EIGEN_DEVICE_FUNC
     72     const typename internal::remove_all<typename XprType::Nested>::type&
     73     expression() const { return m_xpr; }
     74 
     75     EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorShufflingOp)
     76 
     77 
     78   protected:
     79     typename XprType::Nested m_xpr;
     80     const Shuffle m_shuffle;
     81 };
     82 
     83 
     84 // Eval as rvalue
     85 template<typename Shuffle, typename ArgType, typename Device>
     86 struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
     87 {
     88   typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Self;
     89   typedef TensorShufflingOp<Shuffle, ArgType> XprType;
     90   typedef typename XprType::Index Index;
     91   static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
     92   typedef DSizes<Index, NumDims> Dimensions;
     93   typedef typename XprType::Scalar Scalar;
     94   typedef typename XprType::CoeffReturnType CoeffReturnType;
     95   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
     96   static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
     97   typedef StorageMemory<CoeffReturnType, Device> Storage;
     98   typedef typename Storage::Type EvaluatorPointerType;
     99 
    100   enum {
    101     IsAligned         = false,
    102     PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
    103     BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
    104     PreferBlockAccess = true,
    105     Layout            = TensorEvaluator<ArgType, Device>::Layout,
    106     CoordAccess       = false,  // to be implemented
    107     RawAccess         = false
    108   };
    109 
    110   typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
    111 
    112   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
    113   typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
    114   typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
    115 
    116   typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
    117                                                      Layout, Index>
    118       TensorBlock;
    119   //===--------------------------------------------------------------------===//
    120 
    121   EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
    122       : m_device(device),
    123         m_impl(op.expression(), device)
    124   {
    125     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
    126     const Shuffle& shuffle = op.shufflePermutation();
    127     m_is_identity = true;
    128     for (int i = 0; i < NumDims; ++i) {
    129       m_shuffle[i] = static_cast<int>(shuffle[i]);
    130       m_dimensions[i] = input_dims[shuffle[i]];
    131       m_inverseShuffle[shuffle[i]] = i;
    132       if (m_is_identity && shuffle[i] != i) {
    133         m_is_identity = false;
    134       }
    135     }
    136 
    137     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    138       m_unshuffledInputStrides[0] = 1;
    139       m_outputStrides[0] = 1;
    140 
    141       for (int i = 1; i < NumDims; ++i) {
    142         m_unshuffledInputStrides[i] =
    143             m_unshuffledInputStrides[i - 1] * input_dims[i - 1];
    144         m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
    145         m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
    146                   m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
    147       }
    148     } else {
    149       m_unshuffledInputStrides[NumDims - 1] = 1;
    150       m_outputStrides[NumDims - 1] = 1;
    151       for (int i = NumDims - 2; i >= 0; --i) {
    152         m_unshuffledInputStrides[i] =
    153             m_unshuffledInputStrides[i + 1] * input_dims[i + 1];
    154         m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
    155         m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
    156                   m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
    157       }
    158     }
    159 
    160     for (int i = 0; i < NumDims; ++i) {
    161       m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]];
    162     }
    163   }
    164 
    165   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
    166 
    167   EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
    168     m_impl.evalSubExprsIfNeeded(NULL);
    169     return true;
    170   }
    171 
    172 #ifdef EIGEN_USE_THREADS
    173   template <typename EvalSubExprsCallback>
    174   EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
    175       EvaluatorPointerType, EvalSubExprsCallback done) {
    176     m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
    177   }
    178 #endif  // EIGEN_USE_THREADS
    179 
    180   EIGEN_STRONG_INLINE void cleanup() {
    181     m_impl.cleanup();
    182   }
    183 
    184   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
    185   {
    186     if (m_is_identity) {
    187       return m_impl.coeff(index);
    188     } else {
    189       return m_impl.coeff(srcCoeff(index));
    190     }
    191   }
    192 
    193   template <int LoadMode, typename Self, bool ImplPacketAccess>
    194   struct PacketLoader {
    195     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    196     static PacketReturnType Run(const Self& self, Index index) {
    197       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    198       EIGEN_UNROLL_LOOP
    199       for (int i = 0; i < PacketSize; ++i) {
    200         values[i] = self.coeff(index + i);
    201       }
    202       PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    203       return rslt;
    204     }
    205   };
    206 
    207   template<int LoadMode, typename Self>
    208   struct PacketLoader<LoadMode, Self, true> {
    209     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    210     static PacketReturnType Run(const Self& self, Index index) {
    211       if (self.m_is_identity) {
    212         return self.m_impl.template packet<LoadMode>(index);
    213       } else {
    214         EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    215         EIGEN_UNROLL_LOOP
    216         for (int i = 0; i < PacketSize; ++i) {
    217           values[i] = self.coeff(index + i);
    218         }
    219         PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    220         return rslt;
    221       }
    222     }
    223   };
    224 
    225   template<int LoadMode>
    226   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
    227   {
    228     EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    229         eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
    230     return PacketLoader<LoadMode, Self, TensorEvaluator<ArgType, Device>::PacketAccess>::Run(*this, index);
    231   }
    232 
    233   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    234   internal::TensorBlockResourceRequirements getResourceRequirements() const {
    235     static const int inner_dim =
    236         Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
    237 
    238     const size_t target_size = m_device.firstLevelCacheSize();
    239     const bool inner_dim_shuffled = m_shuffle[inner_dim] != inner_dim;
    240 
    241     // Shuffled inner dimensions leads to a random memory access, which is not
    242     // captured by default cost model bytes loaded/stored. We add this cost
    243     // explicitly. The number of cycles picked based on the benchmarks.
    244     // TODO(ezhulenev): This number was picked based on a very questionable
    245     // benchmarks, add benchmarks that are representative of real workloads.
    246     using BlockRequirements = internal::TensorBlockResourceRequirements;
    247     if (inner_dim_shuffled) {
    248       return BlockRequirements::uniform<Scalar>(target_size)
    249           .addCostPerCoeff({0, 0, NumDims * 28});
    250     } else {
    251       return BlockRequirements::skewed<Scalar>(target_size);
    252     }
    253   }
    254 
    255   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
    256   block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
    257           bool root_of_expr_ast = false) const {
    258     assert(m_impl.data() != NULL);
    259 
    260     typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
    261         TensorBlockIO;
    262     typedef typename TensorBlockIO::Dst TensorBlockIODst;
    263     typedef typename TensorBlockIO::Src TensorBlockIOSrc;
    264 
    265     const typename TensorBlock::Storage block_storage =
    266         TensorBlock::prepareStorage(
    267             desc, scratch, /*allow_strided_storage=*/root_of_expr_ast);
    268 
    269     typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides);
    270     TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset()));
    271 
    272     TensorBlockIODst dst(block_storage.dimensions(), block_storage.strides(),
    273                          block_storage.data());
    274 
    275     typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle);
    276     TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);
    277 
    278     return block_storage.AsTensorMaterializedBlock();
    279   }
    280 
    281   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    282     const double compute_cost = m_is_identity ? TensorOpCost::AddCost<Index>() :
    283                                 NumDims * (2 * TensorOpCost::AddCost<Index>() +
    284                                            2 * TensorOpCost::MulCost<Index>() +
    285                                            TensorOpCost::DivCost<Index>());
    286     return m_impl.costPerCoeff(vectorized) +
    287            TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize);
    288   }
    289 
    290   EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
    291 
    292 #ifdef EIGEN_USE_SYCL
    293    // binding placeholder accessors to a command group handler for SYCL
    294   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    295     m_impl.bind(cgh);
    296   }
    297 #endif
    298  protected:
    299   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
    300       Index input_index,
    301       const DSizes<Index, NumDims>& input_block_strides,
    302       const DSizes<Index, NumDims>& output_block_strides,
    303       const DSizes<internal::TensorIntDivisor<Index>, NumDims>& fast_input_block_strides) const {
    304     Index output_index = 0;
    305     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    306       for (int i = NumDims - 1; i > 0; --i) {
    307         const Index idx = input_index / fast_input_block_strides[i];
    308         output_index += idx * output_block_strides[m_inverseShuffle[i]];
    309         input_index -= idx * input_block_strides[i];
    310       }
    311       return output_index + input_index *
    312           output_block_strides[m_inverseShuffle[0]];
    313     } else {
    314       for (int i = 0; i < NumDims - 1; ++i) {
    315         const Index idx = input_index / fast_input_block_strides[i];
    316         output_index += idx * output_block_strides[m_inverseShuffle[i]];
    317         input_index -= idx * input_block_strides[i];
    318       }
    319       return output_index + input_index *
    320           output_block_strides[m_inverseShuffle[NumDims - 1]];
    321     }
    322   }
    323 
    324   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
    325     Index inputIndex = 0;
    326     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    327       for (int i = NumDims - 1; i > 0; --i) {
    328         const Index idx = index / m_fastOutputStrides[i];
    329         inputIndex += idx * m_inputStrides[i];
    330         index -= idx * m_outputStrides[i];
    331       }
    332       return inputIndex + index * m_inputStrides[0];
    333     } else {
    334       for (int i = 0; i < NumDims - 1; ++i) {
    335         const Index idx = index / m_fastOutputStrides[i];
    336         inputIndex += idx * m_inputStrides[i];
    337         index -= idx * m_outputStrides[i];
    338       }
    339       return inputIndex + index * m_inputStrides[NumDims - 1];
    340     }
    341   }
    342 
    343   Dimensions m_dimensions;
    344   bool m_is_identity;
    345   array<int, NumDims> m_shuffle;
    346   array<Index, NumDims> m_inverseShuffle;  // TODO(ezhulenev): Make it int type.
    347   array<Index, NumDims> m_outputStrides;
    348   array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
    349   array<Index, NumDims> m_inputStrides;
    350   array<Index, NumDims> m_unshuffledInputStrides;
    351 
    352   const Device EIGEN_DEVICE_REF m_device;
    353   TensorEvaluator<ArgType, Device> m_impl;
    354 };
    355 
    356 
    357 // Eval as lvalue
    358 template<typename Shuffle, typename ArgType, typename Device>
    359 struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
    360     : public TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
    361 {
    362   typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Base;
    363 
    364   typedef TensorShufflingOp<Shuffle, ArgType> XprType;
    365   typedef typename XprType::Index Index;
    366   static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
    367   typedef DSizes<Index, NumDims> Dimensions;
    368   typedef typename XprType::Scalar Scalar;
    369   typedef typename XprType::CoeffReturnType CoeffReturnType;
    370   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
    371   static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
    372 
    373   enum {
    374     IsAligned         = false,
    375     PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
    376     BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
    377     PreferBlockAccess = true,
    378     Layout            = TensorEvaluator<ArgType, Device>::Layout,
    379     RawAccess         = false
    380   };
    381 
    382   typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
    383 
    384   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
    385   typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
    386   //===--------------------------------------------------------------------===//
    387 
    388   EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
    389       : Base(op, device)
    390   { }
    391 
    392   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
    393   {
    394     return this->m_impl.coeffRef(this->srcCoeff(index));
    395   }
    396 
    397   template <int StoreMode> EIGEN_STRONG_INLINE
    398   void writePacket(Index index, const PacketReturnType& x)
    399   {
    400     EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    401 
    402     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    403     internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
    404     EIGEN_UNROLL_LOOP
    405     for (int i = 0; i < PacketSize; ++i) {
    406       this->coeffRef(index+i) = values[i];
    407     }
    408   }
    409 
    410   template <typename TensorBlock>
    411   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
    412       const TensorBlockDesc& desc, const TensorBlock& block) {
    413     eigen_assert(this->m_impl.data() != NULL);
    414 
    415     typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
    416         TensorBlockIO;
    417     typedef typename TensorBlockIO::Dst TensorBlockIODst;
    418     typedef typename TensorBlockIO::Src TensorBlockIOSrc;
    419 
    420     const Scalar* block_buffer = block.data();
    421 
    422     // TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen
    423     // expression with coefficient and packet access as `src`.
    424     void* mem = NULL;
    425     if (block_buffer == NULL) {
    426       mem = this->m_device.allocate(desc.size() * sizeof(Scalar));
    427       ScalarNoConst* buf = static_cast<ScalarNoConst*>(mem);
    428 
    429       typedef internal::TensorBlockAssignment<
    430           ScalarNoConst, NumDims, typename TensorBlock::XprType, Index>
    431           TensorBlockAssignment;
    432 
    433       TensorBlockAssignment::Run(
    434           TensorBlockAssignment::target(
    435               desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
    436               buf),
    437           block.expr());
    438 
    439       block_buffer = buf;
    440     }
    441 
    442     // Read from block.
    443     TensorBlockIOSrc src(internal::strides<Layout>(desc.dimensions()),
    444                          block_buffer);
    445 
    446     // Write to the output buffer.
    447     typename TensorBlockIO::Dimensions output_strides(
    448         this->m_unshuffledInputStrides);
    449     typename TensorBlockIO::Dimensions output_dimensions;
    450     for (int i = 0; i < NumDims; ++i) {
    451       output_dimensions[this->m_shuffle[i]] = desc.dimension(i);
    452     }
    453     TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(),
    454                          this->srcCoeff(desc.offset()));
    455 
    456     // Reorder dimensions according to the shuffle.
    457     typename TensorBlockIO::DimensionsMap dst_to_src_dim_map;
    458     for (int i = 0; i < NumDims; ++i) {
    459       dst_to_src_dim_map[i] = static_cast<int>(this->m_inverseShuffle[i]);
    460     }
    461     TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);
    462 
    463     // Deallocate temporary buffer used for the block materialization.
    464     if (mem != NULL) this->m_device.deallocate(mem);
    465   }
    466 };
    467 
    468 
    469 } // end namespace Eigen
    470 
    471 #endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H